What is the best strategy for annotating and filtering VCF files using Hail VEP on the UKB RAP?

2. When I then export to VCF, it fails immediately with this error:

> Hail: INFO: Coerced sorted dataset
> ---------------------------------------------------------------------------
> FatalError                                Traceback (most recent call last)
> <ipython-input-11-aa00e6e9af78> in <module>
> ----> 1 hl.export_vcf(annotated_mt,"file:///opt/notebooks/test.annotate.vcf.bgz")
> 
> <decorator-gen-1449> in export_vcf(dataset, output, append_to_header, parallel, metadata, tabix)
> 
> /opt/conda/lib/python3.6/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
>     575     def wrapper(__original_func, *args, **kwargs):
>     576         args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
> --> 577         return __original_func(*args_, **kwargs_)
>     578 
>     579     return wrapper
> 
> /opt/conda/lib/python3.6/site-packages/hail/methods/impex.py in export_vcf(dataset, output, append_to_header, parallel, metadata, tabix)
>     538                                 metadata,
>     539                                 tabix)
> --> 540     Env.backend().execute(ir.MatrixWrite(dataset._mir, writer))
>     541 
>     542 
> 
> /opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
>     108                 raise HailUserError(message_and_trace) from None
>     109 
> --> 110             raise e
> 
> /opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
>      84         # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))
>      85         try:
> ---> 86             result_tuple = self._jhc.backend().executeEncode(jir, stream_codec)
>      87             (result, timings) = (result_tuple._1(), result_tuple._2())
>      88             value = ir.typ._from_encoding(result)
> 
> /cluster/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
>    1255         answer = self.gateway_client.send_command(command)
>    1256         return_value = get_return_value(
> -> 1257             answer, self.gateway_client, self.target_id, self.name)
>    1258 
>    1259         for temp_arg in temp_args:
> 
> /opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
>      29                 raise FatalError('%s\n\nJava stack trace:\n%s\n'
>      30                                  'Hail version: %s\n'
> ---> 31                                  'Error summary: %s' % (deepest, full, hail.__version__, deepest), error_id) from None
>      32         except pyspark.sql.utils.CapturedException as e:
>      33             raise FatalError('%s\n\nJava stack trace:\n%s\n'
> 
> FatalError: VCFParseError: unexpected end of line
> 
> Java stack trace:
> org.apache.spark.SparkException: Job aborted.
> 	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:100)
> 	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1096)
> 	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
> 	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
> 	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> 	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> 	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
> 	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1094)
> 	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1067)
> 	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
> 	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
> 	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> 	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> 	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
> 	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1032)
> 	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply$mcV$sp(PairRDDFunctions.scala:1013)
> 	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply(PairRDDFunctions.scala:1013)
> 	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$3.apply(PairRDDFunctions.scala:1013)
> 	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> 	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> 	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
> 	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1012)
> 	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply$mcV$sp(PairRDDFunctions.scala:970)
> 	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply(PairRDDFunctions.scala:968)
> 	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$2.apply(PairRDDFunctions.scala:968)
> 	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> 	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> 	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
> 	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:968)
> 	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply$mcV$sp(RDD.scala:1517)
> 	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply(RDD.scala:1505)
> 	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$2.apply(RDD.scala:1505)
> 	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> 	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> 	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
> 	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1505)
> 	at is.hail.utils.richUtils.RichRDD$.writeTable$extension(RichRDD.scala:78)
> 	at is.hail.io.vcf.ExportVCF$.apply(ExportVCF.scala:460)
> 	at is.hail.expr.ir.MatrixVCFWriter.apply(MatrixWriter.scala:333)
> 	at is.hail.expr.ir.WrappedMatrixWriter.apply(MatrixWriter.scala:46)
> 	at is.hail.expr.ir.Interpret$.run(Interpret.scala:852)
> 	at is.hail.expr.ir.Interpret$.alreadyLowered(Interpret.scala:57)
> 	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:20)
> 	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.is$hail$expr$ir$LowerOrInterpretNonCompilable$$rewrite$1(LowerOrInterpretNonCompilable.scala:67)
> 	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:72)
> 	at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:69)
> 	at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:16)
> 	at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:16)
> 	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
> 	at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:16)
> 	at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:14)
> 	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
> 	at is.hail.expr.ir.lowering.LoweringPass$class.apply(LoweringPass.scala:14)
> 	at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.apply(LoweringPass.scala:64)
> 	at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:15)
> 	at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:13)
> 	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
> 	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
> 	at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:13)
> 	at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:47)
> 	at is.hail.backend.spark.SparkBackend.is$hail$backend$spark$SparkBackend$$_execute(SparkBackend.scala:381)
> 	at is.hail.backend.spark.SparkBackend$$anonfun$8$$anonfun$apply$4.apply(SparkBackend.scala:417)
> 	at is.hail.backend.spark.SparkBackend$$anonfun$8$$anonfun$apply$4.apply(SparkBackend.scala:414)
> 	at is.hail.backend.ExecuteContext$$anonfun$scoped$1$$anonfun$apply$1.apply(ExecuteContext.scala:47)
> 	at is.hail.backend.ExecuteContext$$anonfun$scoped$1$$anonfun$apply$1.apply(ExecuteContext.scala:47)
> 	at is.hail.utils.package$.using(package.scala:638)
> 	at is.hail.backend.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:47)
> 	at is.hail.backend.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:46)
> 	at is.hail.utils.package$.using(package.scala:638)
> 	at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
> 	at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:46)
> 	at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:275)
> 	at is.hail.backend.spark.SparkBackend$$anonfun$8.apply(SparkBackend.scala:414)
> 	at is.hail.backend.spark.SparkBackend$$anonfun$8.apply(SparkBackend.scala:413)
> 	at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
> 	at is.hail.backend.spark.SparkBackend.executeEncode(SparkBackend.scala:413)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> 	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> 	at java.lang.reflect.Method.invoke(Method.java:498)
> 	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
> 	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
> 	at py4j.Gateway.invoke(Gateway.java:282)
> 	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
> 	at py4j.commands.CallCommand.execute(CallCommand.java:79)
> 	at py4j.GatewayConnection.run(GatewayConnection.java:238)
> 	at java.lang.Thread.run(Thread.java:748)
> 
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 41.0 failed 4 times, most recent failure: Lost task 0.3 in stage 41.0 (TID 185, ip-10-60-8-117.eu-west-2.compute.internal, executor 0): org.apache.spark.SparkException: Task failed while writing rows
> 	at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:157)
> 	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:83)
> 	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
> 	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
> 	at org.apache.spark.scheduler.Task.run(Task.scala:123)
> 	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
> 	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
> 	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
> 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> 	at java.lang.Thread.run(Thread.java:748)
> .
> .
> .
> .
> .
> at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:15)
> 	at is.hail.utils.package$.fatal(package.scala:78)
> 	at is.hail.io.vcf.MatrixVCFReader$$anonfun$21$$anonfun$apply$11$$anonfun$apply$12.apply(LoadVCF.scala:1758)
> 	at is.hail.io.vcf.MatrixVCFReader$$anonfun$21$$anonfun$apply$11$$anonfun$apply$12.apply(LoadVCF.scala:1747)
> 	at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:464)
> 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> 	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> 	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> 	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> 	at is.hail.utils.richUtils.RichContextRDD$$anonfun$cleanupRegions$1$$anon$1.hasNext(RichContextRDD.scala:69)
> 	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> 	at scala.collection.Iterator$JoinIterator.hasNext(Iterator.scala:220)
> 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> 	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:130)
> 	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:129)
> 	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
> 	at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:141)
> 	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:83)
> 	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
> 	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
> 	at org.apache.spark.scheduler.Task.run(Task.scala:123)
> 	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
> 	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
> 	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
> 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> 	at java.lang.Thread.run(Thread.java:748)
> 
> is.hail.io.vcf.VCFParseError: unexpected end of line
> 	at is.hail.io.vcf.VCFLine.parseError(LoadVCF.scala:57)
> 	at is.hail.io.vcf.VCFLine.nextField(LoadVCF.scala:294)
> 	at is.hail.io.vcf.LoadVCF$.parseLine(LoadVCF.scala:1463)
> 	at is.hail.io.vcf.LoadVCF$.parseLine(LoadVCF.scala:1317)
> 	at is.hail.io.vcf.MatrixVCFReader$$anonfun$21$$anonfun$apply$11$$anonfun$apply$12.apply(LoadVCF.scala:1754)
> 	at is.hail.io.vcf.MatrixVCFReader$$anonfun$21$$anonfun$apply$11$$anonfun$apply$12.apply(LoadVCF.scala:1747)
> 	at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:464)
> 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> 	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> 	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> 	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> 	at is.hail.utils.richUtils.RichContextRDD$$anonfun$cleanupRegions$1$$anon$1.hasNext(RichContextRDD.scala:69)
> 	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
> 	at scala.collection.Iterator$JoinIterator.hasNext(Iterator.scala:220)
> 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
> 	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:130)
> 	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:129)
> 	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
> 	at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:141)
> 	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:83)
> 	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
> 	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
> 	at org.apache.spark.scheduler.Task.run(Task.scala:123)
> 	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
> 	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
> 	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
> 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> 	at java.lang.Thread.run(Thread.java:748)
> 
> Hail version: 0.2.78-b17627756568
> Error summary: VCFParseError: unexpected end of line
  1. First, I subsetted the data with bcftools and then performed QC using vcftools. The format of the resulting VCF file is:

chr1 69026 chr1_69026_T_G T G 38 . AF=1e-06;AQ=38;AC=1;AN=226780 GT:DP:AD:GQ:PL:RNC

The export to VCF fails regardless of whether I annotate or not, so I think you are right that something is wrong with the input. However, I don't understand where I went wrong, given that I processed the VCF files with standard tools (bcftools and vcftools). Note that the example record above appears to end with the FORMAT string (GT:DP:AD:GQ:PL:RNC) but no per-sample genotype fields, which may be related to the "unexpected end of line" parse error.