Hi Dan,
I upgraded Hail to 0.2.124, and VEP is now working on the cluster, which is great. However, I am running into a different error, and I can’t tell from the error message whether it is caused by the input to VEP or by something else.
Here is the code:
import hail as hl

all_genomes_mt = hl.read_matrix_table(f'gs://{genomes_mt_path}')
just_variants = all_genomes_mt.rows()  # keep only the row (variant) fields as a Table
vep_results = hl.vep(just_variants)
And here is the full stack trace:
Traceback (most recent call last):
File "/tmp/d9c64535d84a478bba71557458f7876a/run-vep.py", line 31, in <module>
vep_results = hl.vep( just_variants )
File "<decorator-gen-1752>", line 2, in vep
File "/opt/conda/default/lib/python3.10/site-packages/hail/typecheck/check.py", line 587, in wrapper
return __original_func(*args_, **kwargs_)
File "/opt/conda/default/lib/python3.10/site-packages/hail/methods/qc.py", line 1181, in vep
'tolerateParseError': tolerate_parse_error})).persist()
File "<decorator-gen-1224>", line 2, in persist
File "/opt/conda/default/lib/python3.10/site-packages/hail/typecheck/check.py", line 587, in wrapper
return __original_func(*args_, **kwargs_)
File "/opt/conda/default/lib/python3.10/site-packages/hail/table.py", line 2112, in persist
return Env.backend().persist(self)
File "/opt/conda/default/lib/python3.10/site-packages/hail/backend/backend.py", line 208, in persist
persisted = dataset.checkpoint(tempfile.__enter__())
File "<decorator-gen-1214>", line 2, in checkpoint
File "/opt/conda/default/lib/python3.10/site-packages/hail/typecheck/check.py", line 587, in wrapper
return __original_func(*args_, **kwargs_)
File "/opt/conda/default/lib/python3.10/site-packages/hail/table.py", line 1331, in checkpoint
self.write(output=output, overwrite=overwrite, stage_locally=stage_locally, _codec_spec=_codec_spec)
File "<decorator-gen-1216>", line 2, in write
File "/opt/conda/default/lib/python3.10/site-packages/hail/typecheck/check.py", line 587, in wrapper
return __original_func(*args_, **kwargs_)
File "/opt/conda/default/lib/python3.10/site-packages/hail/table.py", line 1377, in write
Env.backend().execute(ir.TableWrite(self._tir, ir.TableNativeWriter(output, overwrite, stage_locally, _codec_spec)))
File "/opt/conda/default/lib/python3.10/site-packages/hail/backend/py4j_backend.py", line 82, in execute
raise e.maybe_user_error(ir) from None
File "/opt/conda/default/lib/python3.10/site-packages/hail/backend/py4j_backend.py", line 76, in execute
result_tuple = self._jbackend.executeEncode(jir, stream_codec, timed)
File "/usr/lib/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1321, in __call__
File "/opt/conda/default/lib/python3.10/site-packages/hail/backend/py4j_backend.py", line 35, in deco
raise fatal_error_from_java_error_triplet(deepest, full, error_id) from None
hail.utils.java.FatalError: SparkException: Job 0 cancelled because SparkContext was shut down
Java stack trace:
org.apache.spark.SparkException: Job 0 cancelled because SparkContext was shut down
at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1(DAGScheduler.scala:1188)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1$adapted(DAGScheduler.scala:1186)
at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:1186)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:2887)
at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:2784)
at org.apache.spark.SparkContext.$anonfun$stop$11(SparkContext.scala:2095)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1484)
at org.apache.spark.SparkContext.stop(SparkContext.scala:2095)
at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:125)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2293)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
at is.hail.backend.spark.SparkBackend.parallelizeAndComputeWithIndex(SparkBackend.scala:406)
at is.hail.backend.BackendUtils.collectDArray(BackendUtils.scala:86)
at __C832Compiled.__m836split_Let(Emit.scala)
at __C832Compiled.apply(Emit.scala)
at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$4(CompileAndEvaluate.scala:61)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$2(CompileAndEvaluate.scala:61)
at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$2$adapted(CompileAndEvaluate.scala:59)
at is.hail.backend.ExecuteContext.$anonfun$scopedExecution$1(ExecuteContext.scala:140)
at is.hail.utils.package$.using(package.scala:637)
at is.hail.backend.ExecuteContext.scopedExecution(ExecuteContext.scala:140)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:59)
at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$apply$1(CompileAndEvaluate.scala:19)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.CompileAndEvaluate$.apply(CompileAndEvaluate.scala:19)
at is.hail.expr.ir.TableWriter.apply(TableWriter.scala:46)
at is.hail.expr.ir.Interpret$.run(Interpret.scala:865)
at is.hail.expr.ir.Interpret$.alreadyLowered(Interpret.scala:59)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:20)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.rewrite$1(LowerOrInterpretNonCompilable.scala:58)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:63)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:77)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$3(LoweringPass.scala:26)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$1(LoweringPass.scala:26)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass.apply(LoweringPass.scala:24)
at is.hail.expr.ir.lowering.LoweringPass.apply$(LoweringPass.scala:23)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.apply(LoweringPass.scala:72)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1(LoweringPipeline.scala:22)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1$adapted(LoweringPipeline.scala:20)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:20)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:50)
at is.hail.backend.spark.SparkBackend._execute(SparkBackend.scala:505)
at is.hail.backend.spark.SparkBackend.$anonfun$executeEncode$2(SparkBackend.scala:541)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:76)
at is.hail.utils.package$.using(package.scala:637)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:76)
at is.hail.utils.package$.using(package.scala:637)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:62)
at is.hail.backend.spark.SparkBackend.$anonfun$withExecuteContext$1(SparkBackend.scala:345)
at is.hail.backend.spark.SparkBackend.$anonfun$executeEncode$1(SparkBackend.scala:538)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.backend.spark.SparkBackend.executeEncode(SparkBackend.scala:537)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:829)
The Hail logs on the master node say the error preceding the shutdown of the SparkContext was an "error while applying lowering ‘LowerOrInterpretNonCompilable’", presumably in reference to is.hail.expr.ir.LowerOrInterpretNonCompilable, but I don’t know what that pass does (I looked at the source, but couldn’t even guess what it was trying to do, specifically). Any idea where I should start troubleshooting?
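In case it helps, here is the kind of minimal check I was thinking of trying next to tell whether the input is the problem. This is just a rough sketch; the 1,000-row sample size is an arbitrary number I picked, and I haven’t confirmed it actually isolates the failure:

# Run VEP on a small, arbitrary slice of the variants. If this succeeds,
# the failure is more likely tied to scale (e.g., executors dying under
# memory pressure, which would also shut down the SparkContext) than to
# the VEP setup itself; if it fails, at least I have a small repro.
small_sample = just_variants.head(1000)
small_vep = hl.vep(small_sample)
small_vep.count()  # force evaluation

Does that seem like a reasonable way to narrow it down, or is there something more direct I should look at first?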
Thanks,
Daniel Cotter