Hi @danielgoldstein , I use a single node with 2TB of RAM and 128 CPUs. The last lines of the log contain this (error seems to be different than before):
at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$4(CompileAndEvaluate.scala:60)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:84)
at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$2(CompileAndEvaluate.scala:60)
at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$2$adapted(CompileAndEvaluate.scala:58)
at is.hail.backend.ExecuteContext.$anonfun$scopedExecution$1(ExecuteContext.scala:144)
at is.hail.utils.package$.using(package.scala:664)
at is.hail.backend.ExecuteContext.scopedExecution(ExecuteContext.scala:144)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:58)
at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$apply$1(CompileAndEvaluate.scala:17)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:84)
at is.hail.expr.ir.CompileAndEvaluate$.apply(CompileAndEvaluate.scala:17)
at is.hail.expr.ir.TableWriter.apply(TableWriter.scala:51)
at is.hail.expr.ir.Interpret$.run(Interpret.scala:921)
at is.hail.expr.ir.Interpret$.alreadyLowered(Interpret.scala:66)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:20)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.rewrite$1(LowerOrInterpretNonCompilable.scala:59)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:64)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:83)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$3(LoweringPass.scala:32)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:84)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$1(LoweringPass.scala:32)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:84)
at is.hail.expr.ir.lowering.LoweringPass.apply(LoweringPass.scala:30)
at is.hail.expr.ir.lowering.LoweringPass.apply$(LoweringPass.scala:29)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.apply(LoweringPass.scala:78)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1(LoweringPipeline.scala:21)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1$adapted(LoweringPipeline.scala:19)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:19)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:45)
at is.hail.backend.spark.SparkBackend._execute(SparkBackend.scala:600)
at is.hail.backend.spark.SparkBackend.$anonfun$execute$4(SparkBackend.scala:636)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:84)
at is.hail.backend.spark.SparkBackend.$anonfun$execute$3(SparkBackend.scala:631)
at is.hail.backend.spark.SparkBackend.$anonfun$execute$3$adapted(SparkBackend.scala:630)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:78)
at is.hail.utils.package$.using(package.scala:664)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:78)
at is.hail.utils.package$.using(package.scala:664)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:13)
at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:65)
at is.hail.backend.spark.SparkBackend.$anonfun$withExecuteContext$2(SparkBackend.scala:407)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:55)
at is.hail.utils.ExecutionTimer$.logTime(ExecutionTimer.scala:62)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:393)
at is.hail.backend.spark.SparkBackend.execute(SparkBackend.scala:630)
at is.hail.backend.BackendHttpHandler.handle(BackendServer.scala:88)
at com.sun.net.httpserver.Filter$Chain.doFilter(Filter.java:79)
at sun.net.httpserver.AuthFilter.doFilter(AuthFilter.java:83)
at com.sun.net.httpserver.Filter$Chain.doFilter(Filter.java:82)
at sun.net.httpserver.ServerImpl$Exchange$LinkHandler.handle(ServerImpl.java:822)
at com.sun.net.httpserver.Filter$Chain.doFilter(Filter.java:79)
at sun.net.httpserver.ServerImpl$Exchange.run(ServerImpl.java:794)
at sun.net.httpserver.ServerImpl$DefaultExecutor.execute(ServerImpl.java:199)
at sun.net.httpserver.ServerImpl$Dispatcher.handle(ServerImpl.java:544)
at sun.net.httpserver.ServerImpl$Dispatcher.run(ServerImpl.java:509)
at java.lang.Thread.run(Thread.java:750)
Hail version: 0.2.130-bea04d9c79b5
Error summary: SparkException: Job 16 cancelled because SparkContext was shut down
The first lines of the traceback indicate an issue during ld_prune:
File "<stdin>", line 1, in <module>
File "<decorator-gen-1774>", line 2, in ld_prune
File ".conda/envs/hail-0.2.127/lib/python3.12/site-packages/hail/typecheck/check.py", line 585, in wrapper
return __original_func(*args_, **kwargs_)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".conda/envs/hail-0.2.127/lib/python3.12/site-packages/hail/methods/statgen.py", line 4857, in ld_prune
variants_to_remove = hl.maximal_independent_set(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "<decorator-gen-1624>", line 2, in maximal_independent_set
File ".conda/envs/hail-0.2.127/lib/python3.12/site-packages/hail/typecheck/check.py", line 585, in wrapper
return __original_func(*args_, **kwargs_)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".conda/envs/hail-0.2.127/lib/python3.12/site-packages/hail/methods/misc.py", line 152, in maximal_independent_set
edges = edges.checkpoint(new_temp_file())
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "<decorator-gen-1214>", line 2, in checkpoint
File ".conda/envs/hail-0.2.127/lib/python3.12/site-packages/hail/typecheck/check.py", line 585, in wrapper
return __original_func(*args_, **kwargs_)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".conda/envs/hail-0.2.127/lib/python3.12/site-packages/hail/table.py", line 1960, in checkpoint
self.write(output=output, overwrite=overwrite, stage_locally=stage_locally, _codec_spec=_codec_spec)
File "<decorator-gen-1216>", line 2, in write
File ".conda/envs/hail-0.2.127/lib/python3.12/site-packages/hail/typecheck/check.py", line 585, in wrapper
return __original_func(*args_, **kwargs_)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".conda/envs/hail-0.2.127/lib/python3.12/site-packages/hail/table.py", line 2002, in write
Env.backend().execute(
File ".conda/envs/hail-0.2.127/lib/python3.12/site-packages/hail/backend/spark_backend.py", line 226, in execute
raise err
File "conda/envs/hail-0.2.127/lib/python3.12/site-packages/hail/backend/spark_backend.py", line 218, in execute
return super().execute(ir, timed)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".conda/envs/hail-0.2.127/lib/python3.12/site-packages/hail/backend/backend.py", line 190, in execute
raise e.maybe_user_error(ir) from None
File "/.conda/envs/hail-0.2.127/lib/python3.12/site-packages/hail/backend/backend.py", line 188, in execute
result, timings = self._rpc(ActionTag.EXECUTE, payload)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "conda/envs/hail-0.2.127/lib/python3.12/site-packages/hail/backend/py4j_backend.py", line 221, in _rpc
raise fatal_error_from_java_error_triplet(
hail.utils.java.FatalError: SparkException: Job 16 cancelled because SparkContext was shut down