Hi!
I am working with UK Biobank data on the Research Analysis Platform (RAP, DNAnexus). When I try to count the number of rows and columns in my mt_match MatrixTable, the error below is raised. Is it related to memory issues? The MatrixTable has almost 500,000 columns and around 500,000 rows.
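For context, this is roughly what I am running; the dnax:// path is a placeholder for my actual project database, and mt_match comes from some upstream filtering and joining that I have omitted here:

```python
import hail as hl

hl.init()  # Spark-backed Hail session on the RAP cluster

# Placeholder path: the real MatrixTable lives in a DNAnexus Hail database
mt_match = hl.read_matrix_table("dnax://database-XXXX/mt_match.mt")

mt_match.count()  # this is the call that raises the error below
```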
Is there any workaround? For example, would counting the rows and columns separately, as sketched below, behave any differently?
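Something like this is what I had in mind, although I have not confirmed it avoids whatever is failing here:

```python
# Count the two dimensions separately instead of in one job.
# count_cols() only needs the (small) column data, so that half
# should at least be cheap; count_rows() still scans the row data.
n_rows = mt_match.count_rows()
n_cols = mt_match.count_cols()
print(f"rows: {n_rows}, cols: {n_cols}")
```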
Thank you in advance,
2023-11-23 18:10:34.712 Hail: INFO: Coerced sorted dataset
2023-11-23 18:10:39.786 Hail: INFO: wrote table with 534073 rows in 1 partition to /tmp/__iruid_6022-marrTkOu48TktFgtCVBxg7
FatalError Traceback (most recent call last)
Cell In[16], line 1
----> 1 mt_match.count()
File /opt/conda/lib/python3.9/site-packages/hail/matrixtable.py:2620, in MatrixTable.count(self)
2607 """Count the number of rows and columns in the matrix.
2608
2609 Examples
(...)
2617 Number of rows, number of cols.
2618 """
2619 count_ir = ir.MatrixCount(self._mir)
--> 2620 return Env.backend().execute(count_ir)
File /opt/conda/lib/python3.9/site-packages/hail/backend/py4j_backend.py:82, in Py4JBackend.execute(self, ir, timed)
80 return (value, timings) if timed else value
81 except FatalError as e:
--> 82 raise e.maybe_user_error(ir) from None
File /opt/conda/lib/python3.9/site-packages/hail/backend/py4j_backend.py:76, in Py4JBackend.execute(self, ir, timed)
74 # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))
75 try:
--> 76 result_tuple = self._jbackend.executeEncode(jir, stream_codec, timed)
77 (result, timings) = (result_tuple._1(), result_tuple._2())
78 value = ir.typ._from_encoding(result)
File /cluster/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py:1321, in JavaMember.__call__(self, *args)
1315 command = proto.CALL_COMMAND_NAME +\
1316 self.command_header +\
1317 args_command +\
1318 proto.END_COMMAND_PART
1320 answer = self.gateway_client.send_command(command)
--> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
1325 temp_arg._detach()
File /opt/conda/lib/python3.9/site-packages/hail/backend/py4j_backend.py:35, in handle_java_exception.<locals>.deco(*args, **kwargs)
33 tpl = Env.jutils().handleForPython(e.java_exception)
34 deepest, full, error_id = tpl._1(), tpl._2(), tpl._3()
--> 35 raise fatal_error_from_java_error_triplet(deepest, full, error_id) from None
36 except pyspark.sql.utils.CapturedException as e:
37 raise FatalError('%s\n\nJava stack trace:\n%s\n'
38 'Hail version: %s\n'
39 'Error summary: %s' % (e.desc, e.stackTrace, hail.__version__, e.desc)) from None
FatalError: EOFException: Cannot seek after EOF
Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 9 in stage 11.0 failed 4 times, most recent failure: Lost task 9.3 in stage 11.0 (TID 54) (ip-10-60-47-141.eu-west-2.compute.internal executor 0): java.io.EOFException: Cannot seek after EOF
at org.apache.hadoop.fs.ChecksumFileSystem$FSDataBoundedInputStream.seek(ChecksumFileSystem.java:354)
at is.hail.io.fs.HadoopFS$$anon$2.seek(HadoopFS.scala:66)
at is.hail.io.fs.WrappedSeekableDataInputStream.seek(FS.scala:32)
at is.hail.io.HadoopFSDataBinaryReader.seek(HadoopFSDataBinaryReader.scala:13)
at __C8517collect_distributed_array_count_per_partition.__m8540split_StreamLen_region198_1166(Unknown Source)
at __C8517collect_distributed_array_count_per_partition.__m8540split_StreamLen(Unknown Source)
at __C8517collect_distributed_array_count_per_partition.apply(Unknown Source)
at __C8517collect_distributed_array_count_per_partition.apply(Unknown Source)
at is.hail.backend.BackendUtils.$anonfun$collectDArray$4(BackendUtils.scala:49)
at is.hail.utils.package$.using(package.scala:635)
at is.hail.annotations.RegionPool.scopedRegion(RegionPool.scala:162)
at is.hail.backend.BackendUtils.$anonfun$collectDArray$3(BackendUtils.scala:48)
at is.hail.backend.spark.SparkBackendComputeRDD.compute(SparkBackend.scala:793)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1491)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2552)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2497)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2496)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2496)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1254)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1254)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1254)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2740)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2682)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2671)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1022)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
at is.hail.backend.spark.SparkBackend.parallelizeAndComputeWithIndex(SparkBackend.scala:368)
at is.hail.backend.BackendUtils.collectDArray(BackendUtils.scala:44)
at __C8475Compiled.__m8477split_StreamFold(Emit.scala)
at __C8475Compiled.apply(Emit.scala)
at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$7(CompileAndEvaluate.scala:74)
at scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:74)
at is.hail.expr.ir.CompileAndEvaluate$.evalToIR(CompileAndEvaluate.scala:33)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:30)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.rewrite$1(LowerOrInterpretNonCompilable.scala:67)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:72)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:67)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$3(LoweringPass.scala:16)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$1(LoweringPass.scala:16)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass.apply(LoweringPass.scala:14)
at is.hail.expr.ir.lowering.LoweringPass.apply$(LoweringPass.scala:13)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.apply(LoweringPass.scala:62)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1(LoweringPipeline.scala:22)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1$adapted(LoweringPipeline.scala:20)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:20)
at is.hail.expr.ir.lowering.EvalRelationalLets$.execute$1(EvalRelationalLets.scala:10)
at is.hail.expr.ir.lowering.EvalRelationalLets$.lower$1(EvalRelationalLets.scala:18)
at is.hail.expr.ir.lowering.EvalRelationalLets$.apply(EvalRelationalLets.scala:37)
at is.hail.expr.ir.lowering.EvalRelationalLetsPass.transform(LoweringPass.scala:147)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$3(LoweringPass.scala:16)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$1(LoweringPass.scala:16)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass.apply(LoweringPass.scala:14)
at is.hail.expr.ir.lowering.LoweringPass.apply$(LoweringPass.scala:13)
at is.hail.expr.ir.lowering.EvalRelationalLetsPass.apply(LoweringPass.scala:141)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1(LoweringPipeline.scala:22)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1$adapted(LoweringPipeline.scala:20)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:20)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:50)
at is.hail.backend.spark.SparkBackend._execute(SparkBackend.scala:463)
at is.hail.backend.spark.SparkBackend.$anonfun$executeEncode$2(SparkBackend.scala:499)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:75)
at is.hail.utils.package$.using(package.scala:635)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:75)
at is.hail.utils.package$.using(package.scala:635)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:63)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:351)
at is.hail.backend.spark.SparkBackend.$anonfun$executeEncode$1(SparkBackend.scala:496)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.backend.spark.SparkBackend.executeEncode(SparkBackend.scala:495)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:750)
java.io.EOFException: Cannot seek after EOF
at org.apache.hadoop.fs.ChecksumFileSystem$FSDataBoundedInputStream.seek(ChecksumFileSystem.java:354)
at is.hail.io.fs.HadoopFS$$anon$2.seek(HadoopFS.scala:66)
at is.hail.io.fs.WrappedSeekableDataInputStream.seek(FS.scala:32)
at is.hail.io.HadoopFSDataBinaryReader.seek(HadoopFSDataBinaryReader.scala:13)
at __C8517collect_distributed_array_count_per_partition.__m8540split_StreamLen_region198_1166(Unknown Source)
at __C8517collect_distributed_array_count_per_partition.__m8540split_StreamLen(Unknown Source)
at __C8517collect_distributed_array_count_per_partition.apply(Unknown Source)
at __C8517collect_distributed_array_count_per_partition.apply(Unknown Source)
at is.hail.backend.BackendUtils.$anonfun$collectDArray$4(BackendUtils.scala:49)
at is.hail.utils.package$.using(package.scala:635)
at is.hail.annotations.RegionPool.scopedRegion(RegionPool.scala:162)
at is.hail.backend.BackendUtils.$anonfun$collectDArray$3(BackendUtils.scala:48)
at is.hail.backend.spark.SparkBackendComputeRDD.compute(SparkBackend.scala:793)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1491)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Hail version: 0.2.116-cd64e0876c94
Error summary: EOFException: Cannot seek after EOF