Hi,
I ran some sample- and variant-level QC on the All of Us data using Hail and wrote the filtered dataset out as a new MatrixTable. When I read the new MatrixTable back in, I hit an issue retrieving variant-level information. For example,
mt_wgs_filtered.show() # won't work
mt_wgs_filtered.sample_qc.call_rate.show() # works
mt_wgs_filtered.variant_qc.call_rate.show() # won't work
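For context, my pipeline was roughly the following (a minimal sketch rather than my exact code; the paths, thresholds, and the mt_wgs name are placeholders):

import hail as hl

mt_wgs = hl.read_matrix_table('gs://.../wgs.mt')  # original MatrixTable

# Sample QC: annotate samples, then filter on a call-rate threshold (placeholder value)
mt_wgs = hl.sample_qc(mt_wgs)
mt_wgs = mt_wgs.filter_cols(mt_wgs.sample_qc.call_rate >= 0.95)

# Variant QC: annotate variants, then filter on a call-rate threshold (placeholder value)
mt_wgs = hl.variant_qc(mt_wgs)
mt_wgs = mt_wgs.filter_rows(mt_wgs.variant_qc.call_rate >= 0.95)

# Write the filtered MatrixTable and read it back
mt_wgs.write('gs://.../wgs_filtered.mt', overwrite=True)
mt_wgs_filtered = hl.read_matrix_table('gs://.../wgs_filtered.mt')

Sample-level fields (such as sample_qc.call_rate) read back fine, but anything that touches the row (variant) data fails.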
The error message looks like:
FatalError: HailException: Premature end of file: expected 4 bytes, found 0
Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 39 in stage 11.0 failed 4 times, most recent failure: Lost task 39.3 in stage 11.0 (TID 250) (all-of-us-2936-sw-t5dl.c.terra-vpc-sc-c336a349.internal executor 3): is.hail.utils.HailException: Premature end of file: expected 4 bytes, found 0
at is.hail.utils.ErrorHandling.fatal(ErrorHandling.scala:11)
at is.hail.utils.ErrorHandling.fatal$(ErrorHandling.scala:11)
at is.hail.utils.package$.fatal(package.scala:78)
at is.hail.utils.richUtils.RichInputStream$.readFully$extension1(RichInputStream.scala:13)
at is.hail.io.StreamBlockInputBuffer.readBlock(InputBuffers.scala:546)
at is.hail.io.LZ4InputBlockBuffer.readBlock(InputBuffers.scala:584)
at is.hail.io.BlockingInputBuffer.readBlock(InputBuffers.scala:382)
at is.hail.io.BlockingInputBuffer.ensure(InputBuffers.scala:388)
at is.hail.io.BlockingInputBuffer.readByte(InputBuffers.scala:405)
at is.hail.io.LEB128InputBuffer.readByte(InputBuffers.scala:217)
at is.hail.io.LEB128InputBuffer.readInt(InputBuffers.scala:223)
at __C735collect_distributed_array.__m752SKIP_o_binary(Unknown Source)
at __C735collect_distributed_array.__m774SKIP_r_struct_of_o_binaryANDo_int32ANDo_int32ANDo_int32END(Unknown Source)
at __C735collect_distributed_array.__m773SKIP_r_array_of_r_struct_of_o_binaryANDo_int32ANDo_int32ANDo_int32END(Unknown Source)
at __C735collect_distributed_array.__m772DECODE_r_struct_of_r_array_of_r_struct_of_o_binaryANDo_int32ANDo_int32ANDo_int32ENDEND_TO_SBaseStructPointer(Unknown Source)
at __C735collect_distributed_array.__m743split_StreamLen(Unknown Source)
at __C735collect_distributed_array.apply(Unknown Source)
at __C735collect_distributed_array.apply(Unknown Source)
at is.hail.backend.BackendUtils.$anonfun$collectDArray$2(BackendUtils.scala:31)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.annotations.RegionPool.scopedRegion(RegionPool.scala:162)
at is.hail.backend.BackendUtils.$anonfun$collectDArray$1(BackendUtils.scala:30)
at is.hail.backend.spark.SparkBackendComputeRDD.compute(SparkBackend.scala:728)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2259)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2208)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2207)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2207)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2446)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2388)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2377)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2204)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2225)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2244)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2269)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
at is.hail.backend.spark.SparkBackend.parallelizeAndComputeWithIndex(SparkBackend.scala:288)
at is.hail.backend.BackendUtils.collectDArray(BackendUtils.scala:28)
at __C715Compiled.__m718split_TailLoop(Emit.scala)
at __C715Compiled.__m716split_ToArray(Emit.scala)
at __C715Compiled.apply(Emit.scala)
at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$6(CompileAndEvaluate.scala:68)
at scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:68)
at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$apply$1(CompileAndEvaluate.scala:19)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.CompileAndEvaluate$.apply(CompileAndEvaluate.scala:19)
at is.hail.expr.ir.lowering.LowerTableIR$.applyTable(LowerTableIR.scala:957)
at is.hail.expr.ir.lowering.LowerTableIR$.lower$2(LowerTableIR.scala:620)
at is.hail.expr.ir.lowering.LowerTableIR$.applyTable(LowerTableIR.scala:1076)
at is.hail.expr.ir.lowering.LowerTableIR$.lower$1(LowerTableIR.scala:453)
at is.hail.expr.ir.lowering.LowerTableIR$.apply(LowerTableIR.scala:471)
at is.hail.expr.ir.lowering.LowerToCDA$.lower(LowerToCDA.scala:69)
at is.hail.expr.ir.lowering.LowerToCDA$.apply(LowerToCDA.scala:18)
at is.hail.expr.ir.lowering.LowerToDistributedArrayPass.transform(LoweringPass.scala:77)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:27)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.rewrite$1(LowerOrInterpretNonCompilable.scala:67)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.rewrite$1(LowerOrInterpretNonCompilable.scala:53)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:72)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:69)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$3(LoweringPass.scala:16)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$1(LoweringPass.scala:16)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass.apply(LoweringPass.scala:14)
at is.hail.expr.ir.lowering.LoweringPass.apply$(LoweringPass.scala:13)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.apply(LoweringPass.scala:64)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1(LoweringPipeline.scala:15)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1$adapted(LoweringPipeline.scala:13)
at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:13)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:47)
at is.hail.backend.spark.SparkBackend._execute(SparkBackend.scala:383)
at is.hail.backend.spark.SparkBackend.$anonfun$executeEncode$2(SparkBackend.scala:419)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:48)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:48)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:47)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:277)
at is.hail.backend.spark.SparkBackend.$anonfun$executeEncode$1(SparkBackend.scala:416)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.backend.spark.SparkBackend.executeEncode(SparkBackend.scala:415)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
is.hail.utils.HailException: Premature end of file: expected 4 bytes, found 0
at is.hail.utils.ErrorHandling.fatal(ErrorHandling.scala:11)
at is.hail.utils.ErrorHandling.fatal$(ErrorHandling.scala:11)
at is.hail.utils.package$.fatal(package.scala:78)
at is.hail.utils.richUtils.RichInputStream$.readFully$extension1(RichInputStream.scala:13)
at is.hail.io.StreamBlockInputBuffer.readBlock(InputBuffers.scala:546)
at is.hail.io.LZ4InputBlockBuffer.readBlock(InputBuffers.scala:584)
at is.hail.io.BlockingInputBuffer.readBlock(InputBuffers.scala:382)
at is.hail.io.BlockingInputBuffer.ensure(InputBuffers.scala:388)
at is.hail.io.BlockingInputBuffer.readByte(InputBuffers.scala:405)
at is.hail.io.LEB128InputBuffer.readByte(InputBuffers.scala:217)
at is.hail.io.LEB128InputBuffer.readInt(InputBuffers.scala:223)
at __C735collect_distributed_array.__m752SKIP_o_binary(Unknown Source)
at __C735collect_distributed_array.__m774SKIP_r_struct_of_o_binaryANDo_int32ANDo_int32ANDo_int32END(Unknown Source)
at __C735collect_distributed_array.__m773SKIP_r_array_of_r_struct_of_o_binaryANDo_int32ANDo_int32ANDo_int32END(Unknown Source)
at __C735collect_distributed_array.__m772DECODE_r_struct_of_r_array_of_r_struct_of_o_binaryANDo_int32ANDo_int32ANDo_int32ENDEND_TO_SBaseStructPointer(Unknown Source)
at __C735collect_distributed_array.__m743split_StreamLen(Unknown Source)
at __C735collect_distributed_array.apply(Unknown Source)
at __C735collect_distributed_array.apply(Unknown Source)
at is.hail.backend.BackendUtils.$anonfun$collectDArray$2(BackendUtils.scala:31)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.annotations.RegionPool.scopedRegion(RegionPool.scala:162)
at is.hail.backend.BackendUtils.$anonfun$collectDArray$1(BackendUtils.scala:30)
at is.hail.backend.spark.SparkBackendComputeRDD.compute(SparkBackend.scala:728)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Hail version: 0.2.91-44b441376f9a
Error summary: HailException: Premature end of file: expected 4 bytes, found 0
My QC pipeline didn't throw any errors while it ran, so I assume the code itself is correct. Do you think this is an issue with the All of Us workbench or with Hail?
Best,
Taotao