2. When I export to VCF, it fails immediately with this error:
> Hail: INFO: Coerced sorted dataset
> ---------------------------------------------------------------------------
> FatalError Traceback (most recent call last)
> <ipython-input-11-aa00e6e9af78> in <module>
> ----> 1 hl.export_vcf(annotated_mt,"file:///opt/notebooks/test.annotate.vcf.bgz")
>
> <decorator-gen-1449> in export_vcf(dataset, output, append_to_header, parallel, metadata, tabix)
>
> /opt/conda/lib/python3.6/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
> 575 def wrapper(__original_func, *args, **kwargs):
> 576 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
> --> 577 return __original_func(*args_, **kwargs_)
> 578
> 579 return wrapper
>
> /opt/conda/lib/python3.6/site-packages/hail/methods/impex.py in export_vcf(dataset, output, append_to_header, parallel, metadata, tabix)
> 538 metadata,
> 539 tabix)
> --> 540 Env.backend().execute(ir.MatrixWrite(dataset._mir, writer))
> 541
> 542
>
> /opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
> 108 raise HailUserError(message_and_trace) from None
> 109
> --> 110 raise e
>
> /opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
> 84 # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))
> 85 try:
> ---> 86 result_tuple = self._jhc.backend().executeEncode(jir, stream_codec)
> 87 (result, timings) = (result_tuple._1(), result_tuple._2())
> 88 value = ir.typ._from_encoding(result)
>
> /cluster/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
> 1255 answer = self.gateway_client.send_command(command)
> 1256 return_value = get_return_value(
> -> 1257 answer, self.gateway_client, self.target_id, self.name)
> 1258
> 1259 for temp_arg in temp_args:
>
> /opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
> 29 raise FatalError('%s\n\nJava stack trace:\n%s\n'
> 30 'Hail version: %s\n'
> ---> 31 'Error summary: %s' % (deepest, full, hail.__version__, deepest), error_id) from None
> 32 except pyspark.sql.utils.CapturedException as e:
> 33 raise FatalError('%s\n\nJava stack trace:\n%s\n'
>
> FatalError: VCFParseError: unexpected end of line
>
> Java stack trace:
> org.apache.spark.SparkException: Job aborted.
> at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:100)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1096)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1094)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1067)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1032)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply$mcV$sp(PairRDDFunctions.scala:1013)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply(PairRDDFunctions.scala:1013)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply(PairRDDFunctions.scala:1013)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1012)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply$mcV$sp(PairRDDFunctions.scala:970)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply(PairRDDFunctions.scala:968)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply(PairRDDFunctions.scala:968)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:968)
> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply$mcV$sp(RDD.scala:1517)
> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply(RDD.scala:1505)
> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply(RDD.scala:1505)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
> at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1505)
> at is.hail.utils.richUtils.RichRDD$.writeTable$extension(RichRDD.scala:78)
> at is.hail.io.vcf.ExportVCF$.apply(ExportVCF.scala:460)
> at is.hail.expr.ir.MatrixVCFWriter.apply(MatrixWriter.scala:333)
> at is.hail.expr.ir.WrappedMatrixWriter.apply(MatrixWriter.scala:46)
> at is.hail.expr.ir.Interpret$.run(Interpret.scala:852)
> at is.hail.expr.ir.Interpret$.alreadyLowered(Interpret.scala:57)
> at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:20)
> at is.hail.expr.ir.LowerOrInterpretNonCompilable$.is$hail$expr$ir$LowerOrInterpretNonCompilable$$rewrite$1(LowerOrInterpretNonCompilable.scala:67)
> at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:72)
> at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:69)
> at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:16)
> at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:16)
> at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
> at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:16)
> at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:14)
> at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
> at is.hail.expr.ir.lowering.LoweringPass$class.apply(LoweringPass.scala:14)
> at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.apply(LoweringPass.scala:64)
> at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:15)
> at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:13)
> at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
> at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
> at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:13)
> at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:47)
> at is.hail.backend.spark.SparkBackend.is$hail$backend$spark$SparkBackend$$_execute(SparkBackend.scala:381)
> at is.hail.backend.spark.SparkBackend$$anonfun$8$$anonfun$apply$4.apply(SparkBackend.scala:417)
> at is.hail.backend.spark.SparkBackend$$anonfun$8$$anonfun$apply$4.apply(SparkBackend.scala:414)
> at is.hail.backend.ExecuteContext$$anonfun$scoped$1$$anonfun$apply$1.apply(ExecuteContext.scala:47)
> at is.hail.backend.ExecuteContext$$anonfun$scoped$1$$anonfun$apply$1.apply(ExecuteContext.scala:47)
> at is.hail.utils.package$.using(package.scala:638)
> at is.hail.backend.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:47)
> at is.hail.backend.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:46)
> at is.hail.utils.package$.using(package.scala:638)
> at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
> at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:46)
> at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:275)
> at is.hail.backend.spark.SparkBackend$$anonfun$8.apply(SparkBackend.scala:414)
> at is.hail.backend.spark.SparkBackend$$anonfun$8.apply(SparkBackend.scala:413)
> at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
> at is.hail.backend.spark.SparkBackend.executeEncode(SparkBackend.scala:413)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:498)
> at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
> at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
> at py4j.Gateway.invoke(Gateway.java:282)
> at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
> at py4j.commands.CallCommand.execute(CallCommand.java:79)
> at py4j.GatewayConnection.run(GatewayConnection.java:238)
> at java.lang.Thread.run(Thread.java:748)
>
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 41.0 failed 4 times, most recent failure: Lost task 0.3 in stage 41.0 (TID 185, ip-10-60-8-117.eu-west-2.compute.internal, executor 0): org.apache.spark.SparkException: Task failed while writing rows
> at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:157)
> at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:83)
> at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
> at org.apache.spark.scheduler.Task.run(Task.scala:123)
> at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> ...
> at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:15)
> at is.hail.utils.package$.fatal(package.scala:78)
> at is.hail.io.vcf.MatrixVCFReader$$anonfun$21$$anonfun$apply$11$$anonfun$apply$12.apply(LoadVCF.scala:1758)
> at is.hail.io.vcf.MatrixVCFReader$$anonfun$21$$anonfun$apply$11$$anonfun$apply$12.apply(LoadVCF.scala:1747)
> at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:464)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> at is.hail.utils.richUtils.RichContextRDD$$anonfun$cleanupRegions$1$$anon$1.hasNext(RichContextRDD.scala:69)
> at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> at scala.collection.Iterator$JoinIterator.hasNext(Iterator.scala:220)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:130)
> at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:129)
> at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
> at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:141)
> at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:83)
> at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
> at org.apache.spark.scheduler.Task.run(Task.scala:123)
> at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
>
> is.hail.io.vcf.VCFParseError: unexpected end of line
> at is.hail.io.vcf.VCFLine.parseError(LoadVCF.scala:57)
> at is.hail.io.vcf.VCFLine.nextField(LoadVCF.scala:294)
> at is.hail.io.vcf.LoadVCF$.parseLine(LoadVCF.scala:1463)
> at is.hail.io.vcf.LoadVCF$.parseLine(LoadVCF.scala:1317)
> at is.hail.io.vcf.MatrixVCFReader$$anonfun$21$$anonfun$apply$11$$anonfun$apply$12.apply(LoadVCF.scala:1754)
> at is.hail.io.vcf.MatrixVCFReader$$anonfun$21$$anonfun$apply$11$$anonfun$apply$12.apply(LoadVCF.scala:1747)
> at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:464)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> at is.hail.utils.richUtils.RichContextRDD$$anonfun$cleanupRegions$1$$anon$1.hasNext(RichContextRDD.scala:69)
> at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> at scala.collection.Iterator$JoinIterator.hasNext(Iterator.scala:220)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:130)
> at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:129)
> at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
> at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:141)
> at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:83)
> at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
> at org.apache.spark.scheduler.Task.run(Task.scala:123)
> at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
>
> Hail version: 0.2.78-b17627756568
> Error summary: VCFParseError: unexpected end of line
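Looking at the deepest frames, the `VCFParseError` is raised from `is.hail.io.vcf.LoadVCF.parseLine`, i.e., while Hail parses the *input* VCF. Since Hail reads lazily, the parse error only surfaces once `export_vcf` forces a full pass over the rows. A minimal sketch to confirm this without exporting at all (the path, reference genome, and import options below are placeholders for whatever I used originally):

```python
import hail as hl

hl.init()

# Placeholder path/options -- substitute whatever was used for the
# original import.
mt = hl.import_vcf(
    "file:///opt/notebooks/input.vcf.bgz",
    reference_genome="GRCh38",
    force_bgz=True,
)

# import_vcf only parses the header up front; forcing a pass over all
# rows should raise the same VCFParseError if the input is malformed,
# taking export_vcf out of the picture.
mt.rows().count()
```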
- I subsetted with bcftools and then ran QC with vcftools. A data line from the resulting VCF looks like this:
```
chr1	69026	chr1_69026_T_G	T	G	38	.	AF=1e-06;AQ=38;AC=1;AN=226780	GT:DP:AD:GQ:PL:RNC
```
The export to VCF fails whether or not I annotate, so I think you are right that the problem is with the input, but I don't see where I went wrong, given that I processed the VCF files with standard tools.
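To pin down the offending line(s), a rough check like the following counts tab-separated fields per line and flags any line shorter or longer than the header, which is what "unexpected end of line" suggests. The path is a placeholder; BGZF is gzip-compatible, so Python's gzip module can read the .bgz directly:

```python
import gzip

# Placeholder path to the bgzipped VCF produced by bcftools/vcftools.
path = "input.vcf.bgz"

expected = None
with gzip.open(path, "rt") as f:
    for lineno, line in enumerate(f, start=1):
        if line.startswith("##"):
            continue  # skip meta-information lines
        n_fields = len(line.rstrip("\n").split("\t"))
        if line.startswith("#CHROM"):
            expected = n_fields  # the header fixes the column count
            continue
        if n_fields != expected:
            print(f"line {lineno}: {n_fields} fields (expected {expected})")
```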