Failure to retrieve row information from a Hail MatrixTable

Hi,

I did some sample- and variant-level QC in All of Us using Hail and wrote the filtered data as a new MatrixTable. When I read the filtered table back, I ran into an issue retrieving variant-level information. For example,

mt_wgs_filtered.show()    # won't work
mt_wgs_filtered.sample_qc.call_rate.show()  # works
mt_wgs_filtered.variant_qc.call_rate.show()  # won't work
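
For context, the pipeline that produced the filtered table was roughly of this shape (a sketch, not my exact code; the paths and QC thresholds below are placeholders):

import hail as hl

wgs_path = 'gs://.../wgs.mt'                           # placeholder for the AoU WGS MatrixTable path
filtered_path = 'gs://.../wgs_filtered.mt'             # placeholder for the output path

mt = hl.read_matrix_table(wgs_path)
mt = hl.sample_qc(mt)                                  # adds the sample_qc column field
mt = hl.variant_qc(mt)                                 # adds the variant_qc row field
mt = mt.filter_cols(mt.sample_qc.call_rate > 0.99)     # illustrative threshold
mt = mt.filter_rows(mt.variant_qc.call_rate > 0.99)    # illustrative threshold
mt.write(filtered_path, overwrite=True)

mt_wgs_filtered = hl.read_matrix_table(filtered_path)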

The error message looks like:

FatalError: HailException: Premature end of file: expected 4 bytes, found 0

Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 39 in stage 11.0 failed 4 times, most recent failure: Lost task 39.3 in stage 11.0 (TID 250) (all-of-us-2936-sw-t5dl.c.terra-vpc-sc-c336a349.internal executor 3): is.hail.utils.HailException: Premature end of file: expected 4 bytes, found 0
	at is.hail.utils.ErrorHandling.fatal(ErrorHandling.scala:11)
	at is.hail.utils.ErrorHandling.fatal$(ErrorHandling.scala:11)
	at is.hail.utils.package$.fatal(package.scala:78)
	at is.hail.utils.richUtils.RichInputStream$.readFully$extension1(RichInputStream.scala:13)
	at is.hail.io.StreamBlockInputBuffer.readBlock(InputBuffers.scala:546)
	at is.hail.io.LZ4InputBlockBuffer.readBlock(InputBuffers.scala:584)
	at is.hail.io.BlockingInputBuffer.readBlock(InputBuffers.scala:382)
	at is.hail.io.BlockingInputBuffer.ensure(InputBuffers.scala:388)
	at is.hail.io.BlockingInputBuffer.readByte(InputBuffers.scala:405)
	at is.hail.io.LEB128InputBuffer.readByte(InputBuffers.scala:217)
	at is.hail.io.LEB128InputBuffer.readInt(InputBuffers.scala:223)
	at __C735collect_distributed_array.__m752SKIP_o_binary(Unknown Source)
	at __C735collect_distributed_array.__m774SKIP_r_struct_of_o_binaryANDo_int32ANDo_int32ANDo_int32END(Unknown Source)
	at __C735collect_distributed_array.__m773SKIP_r_array_of_r_struct_of_o_binaryANDo_int32ANDo_int32ANDo_int32END(Unknown Source)
	at __C735collect_distributed_array.__m772DECODE_r_struct_of_r_array_of_r_struct_of_o_binaryANDo_int32ANDo_int32ANDo_int32ENDEND_TO_SBaseStructPointer(Unknown Source)
	at __C735collect_distributed_array.__m743split_StreamLen(Unknown Source)
	at __C735collect_distributed_array.apply(Unknown Source)
	at __C735collect_distributed_array.apply(Unknown Source)
	at is.hail.backend.BackendUtils.$anonfun$collectDArray$2(BackendUtils.scala:31)
	at is.hail.utils.package$.using(package.scala:638)
	at is.hail.annotations.RegionPool.scopedRegion(RegionPool.scala:162)
	at is.hail.backend.BackendUtils.$anonfun$collectDArray$1(BackendUtils.scala:30)
	at is.hail.backend.spark.SparkBackendComputeRDD.compute(SparkBackend.scala:728)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2259)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2208)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2207)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2446)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2388)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2377)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2204)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2225)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2244)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2269)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at is.hail.backend.spark.SparkBackend.parallelizeAndComputeWithIndex(SparkBackend.scala:288)
	at is.hail.backend.BackendUtils.collectDArray(BackendUtils.scala:28)
	at __C715Compiled.__m718split_TailLoop(Emit.scala)
	at __C715Compiled.__m716split_ToArray(Emit.scala)
	at __C715Compiled.apply(Emit.scala)
	at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$6(CompileAndEvaluate.scala:68)
	at scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:68)
	at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$apply$1(CompileAndEvaluate.scala:19)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.CompileAndEvaluate$.apply(CompileAndEvaluate.scala:19)
	at is.hail.expr.ir.lowering.LowerTableIR$.applyTable(LowerTableIR.scala:957)
	at is.hail.expr.ir.lowering.LowerTableIR$.lower$2(LowerTableIR.scala:620)
	at is.hail.expr.ir.lowering.LowerTableIR$.applyTable(LowerTableIR.scala:1076)
	at is.hail.expr.ir.lowering.LowerTableIR$.lower$1(LowerTableIR.scala:453)
	at is.hail.expr.ir.lowering.LowerTableIR$.apply(LowerTableIR.scala:471)
	at is.hail.expr.ir.lowering.LowerToCDA$.lower(LowerToCDA.scala:69)
	at is.hail.expr.ir.lowering.LowerToCDA$.apply(LowerToCDA.scala:18)
	at is.hail.expr.ir.lowering.LowerToDistributedArrayPass.transform(LoweringPass.scala:77)
	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:27)
	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.rewrite$1(LowerOrInterpretNonCompilable.scala:67)
	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.rewrite$1(LowerOrInterpretNonCompilable.scala:53)
	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:72)
	at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:69)
	at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$3(LoweringPass.scala:16)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$1(LoweringPass.scala:16)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.lowering.LoweringPass.apply(LoweringPass.scala:14)
	at is.hail.expr.ir.lowering.LoweringPass.apply$(LoweringPass.scala:13)
	at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.apply(LoweringPass.scala:64)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1(LoweringPipeline.scala:15)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1$adapted(LoweringPipeline.scala:13)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:13)
	at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:47)
	at is.hail.backend.spark.SparkBackend._execute(SparkBackend.scala:383)
	at is.hail.backend.spark.SparkBackend.$anonfun$executeEncode$2(SparkBackend.scala:419)
	at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:48)
	at is.hail.utils.package$.using(package.scala:638)
	at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:48)
	at is.hail.utils.package$.using(package.scala:638)
	at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
	at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:47)
	at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:277)
	at is.hail.backend.spark.SparkBackend.$anonfun$executeEncode$1(SparkBackend.scala:416)
	at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
	at is.hail.backend.spark.SparkBackend.executeEncode(SparkBackend.scala:415)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)

is.hail.utils.HailException: Premature end of file: expected 4 bytes, found 0
	at is.hail.utils.ErrorHandling.fatal(ErrorHandling.scala:11)
	at is.hail.utils.ErrorHandling.fatal$(ErrorHandling.scala:11)
	at is.hail.utils.package$.fatal(package.scala:78)
	at is.hail.utils.richUtils.RichInputStream$.readFully$extension1(RichInputStream.scala:13)
	at is.hail.io.StreamBlockInputBuffer.readBlock(InputBuffers.scala:546)
	at is.hail.io.LZ4InputBlockBuffer.readBlock(InputBuffers.scala:584)
	at is.hail.io.BlockingInputBuffer.readBlock(InputBuffers.scala:382)
	at is.hail.io.BlockingInputBuffer.ensure(InputBuffers.scala:388)
	at is.hail.io.BlockingInputBuffer.readByte(InputBuffers.scala:405)
	at is.hail.io.LEB128InputBuffer.readByte(InputBuffers.scala:217)
	at is.hail.io.LEB128InputBuffer.readInt(InputBuffers.scala:223)
	at __C735collect_distributed_array.__m752SKIP_o_binary(Unknown Source)
	at __C735collect_distributed_array.__m774SKIP_r_struct_of_o_binaryANDo_int32ANDo_int32ANDo_int32END(Unknown Source)
	at __C735collect_distributed_array.__m773SKIP_r_array_of_r_struct_of_o_binaryANDo_int32ANDo_int32ANDo_int32END(Unknown Source)
	at __C735collect_distributed_array.__m772DECODE_r_struct_of_r_array_of_r_struct_of_o_binaryANDo_int32ANDo_int32ANDo_int32ENDEND_TO_SBaseStructPointer(Unknown Source)
	at __C735collect_distributed_array.__m743split_StreamLen(Unknown Source)
	at __C735collect_distributed_array.apply(Unknown Source)
	at __C735collect_distributed_array.apply(Unknown Source)
	at is.hail.backend.BackendUtils.$anonfun$collectDArray$2(BackendUtils.scala:31)
	at is.hail.utils.package$.using(package.scala:638)
	at is.hail.annotations.RegionPool.scopedRegion(RegionPool.scala:162)
	at is.hail.backend.BackendUtils.$anonfun$collectDArray$1(BackendUtils.scala:30)
	at is.hail.backend.spark.SparkBackendComputeRDD.compute(SparkBackend.scala:728)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Hail version: 0.2.91-44b441376f9a
Error summary: HailException: Premature end of file: expected 4 bytes, found 0


My QC pipeline didn’t throw any errors, so I assume my code is correct. Do you think this is an issue with the All of Us workbench or with Hail?

Best,
Taotao

Hey @TaotaoTan !

Sorry you’re having trouble with Hail. This looks like a known bug in an old version of Hail. That matrix table file is unfortunately corrupt.

Do you know what version you’re using? Try hl.version(). If you can, request a new version from the AoU team; that will help expedite the update. I’ll also contact them.
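
For example, from a notebook cell:

import hail as hl
print(hl.version())    # prints the running Hail version string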

Hi @danking

Thanks for your reply. The Hail version used in All of Us is 0.2.91-44b441376f9a. I will report the issue to them.

– Taotao