Hi @tpoterba,
Thanks for helping me this far. I still get an error even after loading the hs37d5 FASTA sequence. Could you please take a look at the output below and let me know what is going wrong?
>>> rg = l.get_reference('GRCh37') # doctest: +SKIP
>>> rg.add_sequence('hs37d5.fa.gz','hs37d5.fa.gz.fai') # doctest: +SKIP
>>> rgts=rg.add_sequence('hs37d5.fa.gz','hs37d5.fa.gz.fai') # doctest: +SKIP
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "</root/anaconda3/envs/hail/lib/python3.6/site-packages/decorator.py:decorator-gen-34>", line 2, in add_sequence
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/typecheck/check.py", line 560, in wrapper
return __original_func(*args_, **kwargs_)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/genetics/reference_genome.py", line 338, in add_sequence
self._jrep.addSequence(Env.hc()._jhc, fasta_file, index_file)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/utils/java.py", line 227, in deco
'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
hail.utils.java.FatalError: HailException: FASTA sequence has already been loaded for reference genome `GRCh37'.
Java stack trace:
is.hail.utils.HailException: FASTA sequence has already been loaded for reference genome `GRCh37'.
at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
at is.hail.utils.package$.fatal(package.scala:26)
at is.hail.variant.ReferenceGenome.addSequence(ReferenceGenome.scala:314)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Hail version: 0.2.8-70304a52d33d
Error summary: HailException: FASTA sequence has already been loaded for reference genome `GRCh37'.
>>> mt = l.import_vcf('s3a://xxxxxxxxxxxxx/xxxxxxx-001242_p95a28_xxxxx.vcf',
... force_bgz=True,
... min_partitions=10000,
... drop_samples=False,
... reference_genome='GRCh37',
... skip_invalid_loci=False).write('s3a://xxxxxxxxxxxxx/raw.mt', overwrite=True)
[Stage 1:==================================================> (1005 + 2) / 1059]Traceback (most recent call last):
File "<stdin>", line 6, in <module>
File "</root/anaconda3/envs/hail/lib/python3.6/site-packages/decorator.py:decorator-gen-824>", line 2, in write
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/typecheck/check.py", line 560, in wrapper
return __original_func(*args_, **kwargs_)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/matrixtable.py", line 2193, in write
Env.backend().execute(MatrixWrite(self._mir, writer))
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/backend/backend.py", line 44, in execute
self._to_java_ir(ir)))
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/root/anaconda3/envs/hail/lib/python3.6/site-packages/hail/utils/java.py", line 227, in deco
'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
hail.utils.java.FatalError: HailException: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 1006 in stage 1.0 failed 1 times, most recent failure: Lost task 1006.0 in stage 1.0 (TID 1007, localhost, executor driver): is.hail.utils.HailException: xxxxxxxxx-001242_p95a28_xxxxxxxxx.vcf: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
offending line: hs37d5 243520 . A G 74.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=...
at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:20)
at is.hail.utils.package$.fatal(package.scala:26)
at is.hail.utils.Context.wrapException(Context.scala:19)
at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:854)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1039)
at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at scala.collection.AbstractIterator.to(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: is.hail.utils.HailException: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
at is.hail.utils.package$.fatal(package.scala:26)
at is.hail.variant.ReferenceGenome.checkLocus(ReferenceGenome.scala:218)
at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
at scala.Option.foreach(Option.scala:257)
at is.hail.io.vcf.VCFLine.parseAddVariant(LoadVCF.scala:235)
at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:828)
... 30 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1575)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1563)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1562)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1562)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:803)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1790)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1745)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1734)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:619)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:944)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.collect(RDD.scala:943)
at is.hail.sparkextras.ContextRDD.collect(ContextRDD.scala:153)
at is.hail.rvd.RVD$.getKeyInfo(RVD.scala:1044)
at is.hail.rvd.RVD$.makeCoercer(RVD.scala:1108)
at is.hail.io.vcf.MatrixVCFReader.coercer$lzycompute(LoadVCF.scala:1097)
at is.hail.io.vcf.MatrixVCFReader.coercer(LoadVCF.scala:1097)
at is.hail.io.vcf.MatrixVCFReader.apply(LoadVCF.scala:1126)
at is.hail.expr.ir.MatrixRead.execute(MatrixIR.scala:359)
at is.hail.expr.ir.Interpret$.apply(Interpret.scala:754)
at is.hail.expr.ir.Interpret$.apply(Interpret.scala:93)
at is.hail.expr.ir.Interpret$.apply(Interpret.scala:63)
at is.hail.expr.ir.Interpret$.interpretJSON(Interpret.scala:22)
at is.hail.expr.ir.Interpret.interpretJSON(Interpret.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
is.hail.utils.HailException: MCW2018-001242_p95a28_SMT4.vcf: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
offending line: hs37d5 243520 . A G 74.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=...
at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:20)
at is.hail.utils.package$.fatal(package.scala:26)
at is.hail.utils.Context.wrapException(Context.scala:19)
at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:854)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1039)
at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at scala.collection.AbstractIterator.to(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
is.hail.utils.HailException: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.
at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:9)
at is.hail.utils.package$.fatal(package.scala:26)
at is.hail.variant.ReferenceGenome.checkLocus(ReferenceGenome.scala:218)
at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
at is.hail.io.vcf.VCFLine$$anonfun$parseAddVariant$2.apply(LoadVCF.scala:235)
at scala.Option.foreach(Option.scala:257)
at is.hail.io.vcf.VCFLine.parseAddVariant(LoadVCF.scala:235)
at is.hail.io.vcf.LoadVCF$$anonfun$parseLines$1$$anon$1.hasNext(LoadVCF.scala:828)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:438)
at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1039)
at is.hail.rvd.RVD$$anonfun$36.apply(RVD.scala:1038)
at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$30.apply(ContextRDD.scala:369)
at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:149)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at scala.collection.AbstractIterator.to(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:944)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Hail version: 0.2.8-70304a52d33d
Error summary: HailException: Invalid locus `hs37d5:243520' found. Contig `hs37d5' is not in the reference genome `GRCh37'.