Loading VCF assertion error

I am trying to load this VCF file (http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v2.0/integrated_callset/variants_freeze4_sv_insdel_alt.vcf.gz)

With command

ds = hl.import_vcf('/mnt/grid/janowitz/rdata_norepl/sv/pacbio/variants_freeze4_sv_insdel_sym_lilra3.vcf.gz', force_bgz=True, reference_genome='GRCh38')
ds = ds.filter_rows(ds.locus == hl.locus("chr19", 54297003,reference_genome='GRCh38'))
pacbio_entries = ds.entries().to_pandas()

Seeing error:
FatalError: AssertionError: assertion failed

Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 6.0 failed 1 times, most recent failure: Lost task 0.0 in stage 6.0 (TID 5) (bam19.cm.cluster executor driver): java.lang.AssertionError: assertion failed
	at scala.Predef$.assert(Predef.scala:208)
	at is.hail.annotations.RegionValueBuilder.addBoolean(RegionValueBuilder.scala:223)
	at is.hail.io.vcf.VCFLine.parseAddInfo(LoadVCF.scala:973)
	at is.hail.io.vcf.LoadVCF$.parseLine(LoadVCF.scala:1439)
	at is.hail.io.vcf.LoadVCF$.parseLine(LoadVCF.scala:1315)
	at is.hail.io.vcf.MatrixVCFReader.$anonfun$executeGeneric$6(LoadVCF.scala:1753)
	at is.hail.io.vcf.MatrixVCFReader.$anonfun$executeGeneric$6$adapted(LoadVCF.scala:1745)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:513)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at is.hail.io.RichContextRDDLong$$anon$3.hasNext(RichContextRDDRegionValue.scala:197)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at is.hail.expr.ir.TableAggregateByKey$$anon$10.hasNext(TableIR.scala:2616)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at is.hail.io.RichContextRDDLong$$anon$3.hasNext(RichContextRDDRegionValue.scala:197)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at is.hail.rvd.RVD$$anon$1.hasNext(RVD.scala:211)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at is.hail.utils.richUtils.RichContextRDD$$anon$1.hasNext(RichContextRDD.scala:71)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2223)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2242)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2267)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:390)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3519)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3516)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)

java.lang.AssertionError: assertion failed
	at scala.Predef$.assert(Predef.scala:208)
	at is.hail.annotations.RegionValueBuilder.addBoolean(RegionValueBuilder.scala:223)
	at is.hail.io.vcf.VCFLine.parseAddInfo(LoadVCF.scala:973)
	at is.hail.io.vcf.LoadVCF$.parseLine(LoadVCF.scala:1439)
	at is.hail.io.vcf.LoadVCF$.parseLine(LoadVCF.scala:1315)
	at is.hail.io.vcf.MatrixVCFReader.$anonfun$executeGeneric$6(LoadVCF.scala:1753)
	at is.hail.io.vcf.MatrixVCFReader.$anonfun$executeGeneric$6$adapted(LoadVCF.scala:1745)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:513)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at is.hail.io.RichContextRDDLong$$anon$3.hasNext(RichContextRDDRegionValue.scala:197)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at is.hail.expr.ir.TableAggregateByKey$$anon$10.hasNext(TableIR.scala:2616)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at is.hail.io.RichContextRDDLong$$anon$3.hasNext(RichContextRDDRegionValue.scala:197)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at is.hail.rvd.RVD$$anon$1.hasNext(RVD.scala:211)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at is.hail.utils.richUtils.RichContextRDD$$anon$1.hasNext(RichContextRDD.scala:71)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)




Hail version: 0.2.67-40d373134612
Error summary: AssertionError: assertion failed

Can replicate, thank you!

It looks like this is a bad error message from Hail about a violation of the VCF spec – there is an info field REF_TRF of type Flag, which should have Number=0 according to the spec. Here it’s Number=., which broke our importer, because we thought that field would be an array (which doesn’t make sense for flags).

This pull request adds functionality to tolerate this kind of invalid input when importing: