Unable to create a MatrixTable from gnomAD chr1 and chr2

Hi!

I have gnomAD data locally, and I'm trying to convert the .vcf.bgz files for chromosomes 1 and 2 to MatrixTable format with this code:

hl.import_vcf('/cs/prt3/gnomAD_data/chr_vcf/gnomad.genomes.v3.1.2.sites.chr2.vcf.bgz', force_bgz=False, reference_genome=REF_GENOME).write('/cs/prt3/gnomAD_data/gnomAD_mt/chr2.mt')

But I'm getting this error:

---------------------------------------------------------------------------
FatalError                                Traceback (most recent call last)
/tmp/ipykernel_26631/2126931732.py in <module>
----> 1 hl.import_vcf('/cs/prt3/gnomAD_data/chr_vcf/gnomad.genomes.v3.1.2.sites.chr2.vcf.bgz', force_bgz=False, reference_genome=REF_GENOME).write('/cs/prt3/gnomAD_data/gnomAD_mt/chr2.mt')

<decorator-gen-1270> in write(self, output, overwrite, stage_locally, _codec_spec, _partitions, _checkpoint_file)

/cs/labs/michall/ofer.feinstein/my_env/lib/python3.7/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
    575     def wrapper(__original_func, *args, **kwargs):
    576         args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 577         return __original_func(*args_, **kwargs_)
    578 
    579     return wrapper

/cs/labs/michall/ofer.feinstein/my_env/lib/python3.7/site-packages/hail/matrixtable.py in write(self, output, overwrite, stage_locally, _codec_spec, _partitions, _checkpoint_file)
   2554 
   2555         writer = ir.MatrixNativeWriter(output, overwrite, stage_locally, _codec_spec, _partitions, _partitions_type, _checkpoint_file)
-> 2556         Env.backend().execute(ir.MatrixWrite(self._mir, writer))
   2557 
   2558     class _Show:

/cs/labs/michall/ofer.feinstein/my_env/lib/python3.7/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
    102             return (value, timings) if timed else value
    103         except FatalError as e:
--> 104             self._handle_fatal_error_from_backend(e, ir)
    105 
    106     async def _async_execute(self, ir, timed=False):

/cs/labs/michall/ofer.feinstein/my_env/lib/python3.7/site-packages/hail/backend/backend.py in _handle_fatal_error_from_backend(self, err, ir)
    179         error_sources = ir.base_search(lambda x: x._error_id == err._error_id)
    180         if len(error_sources) == 0:
--> 181             raise err
    182 
    183         better_stack_trace = error_sources[0]._stack_trace

/cs/labs/michall/ofer.feinstein/my_env/lib/python3.7/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
     96         # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))
     97         try:
---> 98             result_tuple = self._jbackend.executeEncode(jir, stream_codec)
     99             (result, timings) = (result_tuple._1(), result_tuple._2())
    100             value = ir.typ._from_encoding(result)

/cs/labs/michall/ofer.feinstein/my_env/lib/python3.7/site-packages/py4j/java_gateway.py in __call__(self, *args)
   1303         answer = self.gateway_client.send_command(command)
   1304         return_value = get_return_value(
-> 1305             answer, self.gateway_client, self.target_id, self.name)
   1306 
   1307         for temp_arg in temp_args:

/cs/labs/michall/ofer.feinstein/my_env/lib/python3.7/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
     29             tpl = Env.jutils().handleForPython(e.java_exception)
     30             deepest, full, error_id = tpl._1(), tpl._2(), tpl._3()
---> 31             raise fatal_error_from_java_error_triplet(deepest, full, error_id) from None
     32         except pyspark.sql.utils.CapturedException as e:
     33             raise FatalError('%s\n\nJava stack trace:\n%s\n'

FatalError: ZipException: File does not conform to block gzip format.

Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 1387 in stage 1.0 failed 1 times, most recent failure: Lost task 1387.0 in stage 1.0 (TID 2775) (protonew-1.cs.huji.ac.il executor driver): java.util.zip.ZipException: File does not conform to block gzip format.
	at is.hail.io.compress.BGzipInputStream$BGzipHeader.<init>(BGzipInputStream.java:73)
	at is.hail.io.compress.BGzipInputStream.decompressNextBlock(BGzipInputStream.java:150)
	at is.hail.io.compress.BGzipInputStream.read(BGzipInputStream.java:216)
	at java.base/java.io.InputStream.read(InputStream.java:205)
	at is.hail.relocated.org.apache.commons.io.input.ProxyInputStream.read(ProxyInputStream.java:79)
	at is.hail.expr.ir.GenericLines$$anon$1.loadBuffer(GenericLines.scala:72)
	at is.hail.expr.ir.GenericLines$$anon$1.readLine(GenericLines.scala:182)
	at is.hail.expr.ir.GenericLines$$anon$1.hasNext(GenericLines.scala:202)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:511)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at __C452collect_distributed_array.__m537split_StreamFor_region6_21(Unknown Source)
	at __C452collect_distributed_array.__m537split_StreamFor(Unknown Source)
	at __C452collect_distributed_array.__m535begin_group_0(Unknown Source)
	at __C452collect_distributed_array.__m463split_Let(Unknown Source)
	at __C452collect_distributed_array.apply(Unknown Source)
	at __C452collect_distributed_array.apply(Unknown Source)
	at is.hail.backend.BackendUtils.$anonfun$collectDArray$4(BackendUtils.scala:40)
	at is.hail.utils.package$.using(package.scala:640)
	at is.hail.annotations.RegionPool.scopedRegion(RegionPool.scala:162)
	at is.hail.backend.BackendUtils.$anonfun$collectDArray$3(BackendUtils.scala:39)
	at is.hail.backend.spark.SparkBackendComputeRDD.compute(SparkBackend.scala:761)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:498)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:501)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2303)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2252)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2251)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2251)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1124)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1124)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1124)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2490)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2432)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2421)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:902)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2261)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at is.hail.backend.spark.SparkBackend.parallelizeAndComputeWithIndex(SparkBackend.scala:321)
	at is.hail.backend.BackendUtils.collectDArray(BackendUtils.scala:37)
	at __C440Compiled.apply(Emit.scala)
	at is.hail.expr.ir.LoweredTableReader$.makeCoercer(TableIR.scala:318)
	at is.hail.expr.ir.GenericTableValue.getLTVCoercer(GenericTableValue.scala:134)
	at is.hail.expr.ir.GenericTableValue.toTableStage(GenericTableValue.scala:159)
	at is.hail.io.vcf.MatrixVCFReader.lower(LoadVCF.scala:1798)
	at is.hail.expr.ir.lowering.LowerTableIR$.applyTable(LowerTableIR.scala:706)
	at is.hail.expr.ir.lowering.LowerTableIR$.lower$1(LowerTableIR.scala:458)
	at is.hail.expr.ir.lowering.LowerTableIR$.apply(LowerTableIR.scala:672)
	at is.hail.expr.ir.lowering.LowerToCDA$.lower(LowerToCDA.scala:69)
	at is.hail.expr.ir.lowering.LowerToCDA$.apply(LowerToCDA.scala:18)
	at is.hail.expr.ir.lowering.LowerToDistributedArrayPass.transform(LoweringPass.scala:77)
	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:27)
	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.rewrite$1(LowerOrInterpretNonCompilable.scala:67)
	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:72)
	at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:69)
	at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$3(LoweringPass.scala:16)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$1(LoweringPass.scala:16)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.lowering.LoweringPass.apply(LoweringPass.scala:14)
	at is.hail.expr.ir.lowering.LoweringPass.apply$(LoweringPass.scala:13)
	at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.apply(LoweringPass.scala:64)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1(LoweringPipeline.scala:15)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1$adapted(LoweringPipeline.scala:13)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:13)
	at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:47)
	at is.hail.backend.spark.SparkBackend._execute(SparkBackend.scala:416)
	at is.hail.backend.spark.SparkBackend.$anonfun$executeEncode$2(SparkBackend.scala:452)
	at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:69)
	at is.hail.utils.package$.using(package.scala:640)
	at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:69)
	at is.hail.utils.package$.using(package.scala:640)
	at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
	at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:58)
	at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:310)
	at is.hail.backend.spark.SparkBackend.$anonfun$executeEncode$1(SparkBackend.scala:449)
	at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
	at is.hail.backend.spark.SparkBackend.executeEncode(SparkBackend.scala:448)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:834)

java.util.zip.ZipException: File does not conform to block gzip format.
	at is.hail.io.compress.BGzipInputStream$BGzipHeader.<init>(BGzipInputStream.java:73)
	at is.hail.io.compress.BGzipInputStream.decompressNextBlock(BGzipInputStream.java:150)
	at is.hail.io.compress.BGzipInputStream.read(BGzipInputStream.java:216)
	at java.base/java.io.InputStream.read(InputStream.java:205)
	at is.hail.relocated.org.apache.commons.io.input.ProxyInputStream.read(ProxyInputStream.java:79)
	at is.hail.expr.ir.GenericLines$$anon$1.loadBuffer(GenericLines.scala:72)
	at is.hail.expr.ir.GenericLines$$anon$1.readLine(GenericLines.scala:182)
	at is.hail.expr.ir.GenericLines$$anon$1.hasNext(GenericLines.scala:202)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:511)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at __C452collect_distributed_array.__m537split_StreamFor_region6_21(Unknown Source)
	at __C452collect_distributed_array.__m537split_StreamFor(Unknown Source)
	at __C452collect_distributed_array.__m535begin_group_0(Unknown Source)
	at __C452collect_distributed_array.__m463split_Let(Unknown Source)
	at __C452collect_distributed_array.apply(Unknown Source)
	at __C452collect_distributed_array.apply(Unknown Source)
	at is.hail.backend.BackendUtils.$anonfun$collectDArray$4(BackendUtils.scala:40)
	at is.hail.utils.package$.using(package.scala:640)
	at is.hail.annotations.RegionPool.scopedRegion(RegionPool.scala:162)
	at is.hail.backend.BackendUtils.$anonfun$collectDArray$3(BackendUtils.scala:39)
	at is.hail.backend.spark.SparkBackendComputeRDD.compute(SparkBackend.scala:761)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:498)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:501)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Hail version: 0.2.95-513139587f57
Error summary: ZipException: File does not conform to block gzip format.

I have successfully converted the other chromosomes.

Thanks for your help,
Ofer

Hi,

Does anybody know how to deal with this problem?

This might be a corrupt file. Where did it come from? Can you try running tabix on it to verify that it is correctly encoded?
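
If it helps, here is a minimal Python sketch (my own helper, not part of Hail) that checks the two things that usually go wrong: whether the file starts with a valid BGZF header, and whether the whole stream decompresses cleanly. Running tabix on the file (tabix -p vcf <file>) is an equivalent check. The path below is just your chr2 file; adjust as needed.

import gzip

# Hypothetical helper, not part of Hail: sanity-check a .vcf.bgz file.
def check_bgzf(path):
    # A BGZF file starts with a gzip header (1f 8b 08 04) whose extra
    # field carries a 'BC' subfield; htslib writes that subfield first,
    # so it sits at offsets 12-13 (an assumption for files written by
    # other tools).
    with open(path, 'rb') as f:
        header = f.read(18)
    if not (len(header) == 18
            and header[:4] == b'\x1f\x8b\x08\x04'
            and header[12:14] == b'BC'):
        print('first block does not look like BGZF')
        return
    # Decompress the full stream: BGZF is a series of concatenated gzip
    # members, which Python's gzip module reads end to end. Mid-file
    # corruption (your error came from task 1387, i.e. deep into the
    # file) fails here even when the first block is fine.
    try:
        with gzip.open(path, 'rb') as f:
            while f.read(1024 * 1024):
                pass
        print('decompressed cleanly')
    except (OSError, EOFError) as e:
        print('corrupt stream:', e)

check_bgzf('/cs/prt3/gnomAD_data/chr_vcf/gnomad.genomes.v3.1.2.sites.chr2.vcf.bgz')

If either check fails, re-downloading chr1 and chr2 and comparing the file sizes or checksums against the source is probably the fastest fix.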