ResultStage has failed the maximum allowable number of times

Hi hail team!

I’m running into this error when trying to run some regional missense constraint code (https://github.com/broadinstitute/regional_missense_constraint/blob/simul_fix/rmc/pipeline/regional_constraint.py#L352, which calls https://github.com/broadinstitute/regional_missense_constraint/blob/simul_fix/rmc/utils/constraint.py#L1226):

Traceback (most recent call last):
  File "/tmp/a623cf71902d466daac2aa5f4ea0c68a/regional_constraint.py", line 791, in <module>
    main(args)
  File "/tmp/a623cf71902d466daac2aa5f4ea0c68a/regional_constraint.py", line 413, in main
    context_ht = search_two_break_windows(
  File "/tmp/a623cf71902d466daac2aa5f4ea0c68a/pyscripts_qvum0dqg.zip/rmc/utils/constraint.py", line 1367, in search_two_break_windows
  File "<decorator-gen-1103>", line 2, in checkpoint
  File "/opt/conda/default/lib/python3.8/site-packages/hail/typecheck/check.py", line 577, in wrapper
    return __original_func(*args_, **kwargs_)
  File "/opt/conda/default/lib/python3.8/site-packages/hail/table.py", line 1238, in checkpoint
    self.write(output=output, overwrite=overwrite, stage_locally=stage_locally, _codec_spec=_codec_spec)
  File "<decorator-gen-1105>", line 2, in write
  File "/opt/conda/default/lib/python3.8/site-packages/hail/typecheck/check.py", line 577, in wrapper
    return __original_func(*args_, **kwargs_)
  File "/opt/conda/default/lib/python3.8/site-packages/hail/table.py", line 1271, in write
    Env.backend().execute(ir.TableWrite(self._tir, ir.TableNativeWriter(output, overwrite, stage_locally, _codec_spec)))
  File "/opt/conda/default/lib/python3.8/site-packages/hail/backend/py4j_backend.py", line 98, in execute
    raise e
  File "/opt/conda/default/lib/python3.8/site-packages/hail/backend/py4j_backend.py", line 74, in execute
    result = json.loads(self._jhc.backend().executeJSON(jir))
  File "/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
  File "/opt/conda/default/lib/python3.8/site-packages/hail/backend/py4j_backend.py", line 30, in deco
    raise FatalError('%s\n\nJava stack trace:\n%s\n'
hail.utils.java.FatalError: SparkException: Job aborted due to stage failure: ResultStage 13 (runJob at ContextRDD.scala:238) has failed the maximum allowable number of times: 4. Most recent failure reason: org.apache.spark.shuffle.FetchFailedException 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:770) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:685) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:70) 	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29) 	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486) 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492) 	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) 	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:31) 	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37) 	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:200) 	at org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:128) 	at org.apache.spark.rdd.ShuffledRDD.compute(ShuffledRDD.scala:106) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) 	at org.apache.spark.scheduler.Task.run(Task.scala:131) 	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497) 	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439) 	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500) 	at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 	at java.lang.Thread.run(Thread.java:748) Caused by: java.nio.channels.ClosedChannelException 	at org.apache.spark.network.client.StreamInterceptor.channelInactive(StreamInterceptor.java:62) 	at org.apache.spark.network.util.TransportFrameDecoder.channelInactive(TransportFrameDecoder.java:223) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:262) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:248) 	at io.netty.channel.AbstractChannelHandlerContext.fireChannelInactive(AbstractChannelHandlerContext.java:241) 	at io.netty.channel.DefaultChannelPipeline$HeadContext.channelInactive(DefaultChannelPipeline.java:1405) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:262) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:248) 	at io.netty.channel.DefaultChannelPipeline.fireChannelInactive(DefaultChannelPipeline.java:901) 	at io.netty.channel.AbstractChannel$AbstractUnsafe$8.run(AbstractChannel.java:818) 	at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:164) 	at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:472) 	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:497) 	at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) 	at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) 	at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) 	... 1 more

Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: ResultStage 13 (runJob at ContextRDD.scala:238) has failed the maximum allowable number of times: 4. Most recent failure reason: org.apache.spark.shuffle.FetchFailedException 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:770) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:685) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:70) 	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29) 	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486) 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492) 	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) 	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:31) 	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37) 	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:200) 	at org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:128) 	at org.apache.spark.rdd.ShuffledRDD.compute(ShuffledRDD.scala:106) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) 	at org.apache.spark.scheduler.Task.run(Task.scala:131) 	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497) 	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439) 	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500) 	at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 	at java.lang.Thread.run(Thread.java:748) Caused by: java.nio.channels.ClosedChannelException 	at org.apache.spark.network.client.StreamInterceptor.channelInactive(StreamInterceptor.java:62) 	at org.apache.spark.network.util.TransportFrameDecoder.channelInactive(TransportFrameDecoder.java:223) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:262) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:248) 	at io.netty.channel.AbstractChannelHandlerContext.fireChannelInactive(AbstractChannelHandlerContext.java:241) 	at io.netty.channel.DefaultChannelPipeline$HeadContext.channelInactive(DefaultChannelPipeline.java:1405) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:262) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:248) 	at io.netty.channel.DefaultChannelPipeline.fireChannelInactive(DefaultChannelPipeline.java:901) 	at io.netty.channel.AbstractChannel$AbstractUnsafe$8.run(AbstractChannel.java:818) 	at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:164) 	at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:472) 	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:497) 	at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) 	at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) 	at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) 	... 1 more
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2254)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2203)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2202)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2202)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1763)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2438)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2383)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2372)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2223)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2255)
	at is.hail.sparkextras.ContextRDD.crunJobWithIndex(ContextRDD.scala:238)
	at is.hail.rvd.RVD$.getKeyInfo(RVD.scala:1264)
	at is.hail.rvd.RVD$.makeCoercer(RVD.scala:1339)
	at is.hail.rvd.RVD$.coerce(RVD.scala:1295)
	at is.hail.rvd.RVD.changeKey(RVD.scala:176)
	at is.hail.rvd.RVD.changeKey(RVD.scala:169)
	at is.hail.rvd.RVD.enforceKey(RVD.scala:161)
	at is.hail.expr.ir.TableKeyBy.execute(TableIR.scala:1263)
	at is.hail.expr.ir.TableMapRows.execute(TableIR.scala:1903)
	at is.hail.expr.ir.Interpret$.run(Interpret.scala:790)
	at is.hail.expr.ir.Interpret$.alreadyLowered(Interpret.scala:56)
	at is.hail.expr.ir.InterpretNonCompilable$.interpretAndCoerce$1(InterpretNonCompilable.scala:16)
	at is.hail.expr.ir.InterpretNonCompilable$.rewrite$1(InterpretNonCompilable.scala:53)
	at is.hail.expr.ir.InterpretNonCompilable$.apply(InterpretNonCompilable.scala:58)
	at is.hail.expr.ir.lowering.InterpretNonCompilablePass$.transform(LoweringPass.scala:67)
	at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$3(LoweringPass.scala:15)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$1(LoweringPass.scala:15)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.lowering.LoweringPass.apply(LoweringPass.scala:13)
	at is.hail.expr.ir.lowering.LoweringPass.apply$(LoweringPass.scala:12)
	at is.hail.expr.ir.lowering.InterpretNonCompilablePass$.apply(LoweringPass.scala:62)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1(LoweringPipeline.scala:14)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1$adapted(LoweringPipeline.scala:12)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:12)
	at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:29)
	at is.hail.backend.spark.SparkBackend._execute(SparkBackend.scala:381)
	at is.hail.backend.spark.SparkBackend.$anonfun$execute$1(SparkBackend.scala:365)
	at is.hail.expr.ir.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:47)
	at is.hail.utils.package$.using(package.scala:627)
	at is.hail.expr.ir.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:47)
	at is.hail.utils.package$.using(package.scala:627)
	at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
	at is.hail.expr.ir.ExecuteContext$.scoped(ExecuteContext.scala:46)
	at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:275)
	at is.hail.backend.spark.SparkBackend.execute(SparkBackend.scala:362)
	at is.hail.backend.spark.SparkBackend.$anonfun$executeJSON$1(SparkBackend.scala:406)
	at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
	at is.hail.backend.spark.SparkBackend.executeJSON(SparkBackend.scala:404)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



Hail version: 0.2.74-0c3a74d12093
Error summary: SparkException: Job aborted due to stage failure: ResultStage 13 (runJob at ContextRDD.scala:238) has failed the maximum allowable number of times: 4. Most recent failure reason: org.apache.spark.shuffle.FetchFailedException 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:770) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:685) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:70) 	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29) 	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486) 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492) 	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) 	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:31) 	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37) 	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:200) 	at org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:128) 	at org.apache.spark.rdd.ShuffledRDD.compute(ShuffledRDD.scala:106) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) 	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) 	at org.apache.spark.scheduler.Task.run(Task.scala:131) 	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497) 	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439) 	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500) 	at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 	at java.lang.Thread.run(Thread.java:748) Caused by: java.nio.channels.ClosedChannelException 	at org.apache.spark.network.client.StreamInterceptor.channelInactive(StreamInterceptor.java:62) 	at org.apache.spark.network.util.TransportFrameDecoder.channelInactive(TransportFrameDecoder.java:223) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:262) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:248) 	at io.netty.channel.AbstractChannelHandlerContext.fireChannelInactive(AbstractChannelHandlerContext.java:241) 	at io.netty.channel.DefaultChannelPipeline$HeadContext.channelInactive(DefaultChannelPipeline.java:1405) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:262) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:248) 	at io.netty.channel.DefaultChannelPipeline.fireChannelInactive(DefaultChannelPipeline.java:901) 	at io.netty.channel.AbstractChannel$AbstractUnsafe$8.run(AbstractChannel.java:818) 	at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:164) 	at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:472) 	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:497) 	at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) 	at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) 	at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) 	... 1 more
ERROR: (gcloud.dataproc.jobs.wait) Job [a623cf71902d466daac2aa5f4ea0c68a] failed with error:
Google Cloud Dataproc Agent reports job failure. If logs are available, they can be found at:
https://console.cloud.google.com/dataproc/jobs/a623cf71902d466daac2aa5f4ea0c68a?project=broad-mpg-gnomad&region=us-central1
gcloud dataproc jobs wait 'a623cf71902d466daac2aa5f4ea0c68a' --region 'us-central1' --project 'broad-mpg-gnomad'
https://console.cloud.google.com/storage/browser/dataproc-faa46220-ec08-4f5b-92bd-9722e1963047-us-central1/google-cloud-dataproc-metainfo/6128cd07-3075-4284-9f6a-88ace61011e9/jobs/a623cf71902d466daac2aa5f4ea0c68a/
gs://dataproc-faa46220-ec08-4f5b-92bd-9722e1963047-us-central1/google-cloud-dataproc-metainfo/6128cd07-3075-4284-9f6a-88ace61011e9/jobs/a623cf71902d466daac2aa5f4ea0c68a/driveroutput

I have the log as well, but it looks like it’s a bit too big to upload here.

This last job (with the error above) ran for 1 hour, 33 minutes on 30 non-preemptible highmem-8s (with 200GB disk space) before failing. I also tried running this job on 50 non-preemptible highmem-8s with 100GB disk space a couple times before that and saw the same error (all errors started with ResultStage 13 (runJob at ContextRDD.scala:238) has failed the maximum allowable number of times: 4).

Should I use more workers? Or add more disk space, or something else? I’d appreciate any tips – I’m hoping to generate these results before ASHG. Also, that repo is currently private, but I’m happy to share with anyone who doesn’t have read access!

Likely executors are still dying, and the two usual culprits are memory and disk (in that order). Knowing your pipeline, disk would be unlikely, but you never know. You could look at the executor logs to get an idea, or ssh into a machine and watch its usage while the job is running. You can also try the horrible trick of “doubling your memory” (by halving your number of cores), which I think for -8s was: --properties 'spark.executor.memory=15g,spark.yarn.executor.memoryOverhead=25g'


thanks for the suggestions, Konrad! I added this to my cluster of 30 highmem-8s (pulled out of my notes from when I last ran a densify): --properties 'spark:spark.executor.memoryOverhead=14g,spark:spark.executor.memory=12g'. I also monitored the workers using the Spark UI and free -h on a few of them. It doesn’t seem like the worker nodes used disk space (the “Disk Used” section showed zeroes every time I checked), and none of the workers used more than 13GB of memory.

the job crashed again after 1 hr 26 min at the same step – I can try using the properties you sent instead; do you think that would help?

You can email us the log if it’s too big to post. I am currently pretty confused about what’s going on.

ah sorry about that – just sent the logs I have to hail-team@broadinstitute.org

There’s an issue here:

https://github.com/broadinstitute/regional_missense_constraint/blob/simul_fix/rmc/pipeline/regional_constraint.py#L383-L390

context_ht = context_ht.annotate(
    start_pos=transcript_ht[context_ht.transcript].start_pos,
    end_pos=transcript_ht[context_ht.transcript].end_pos,
    transcript_size=(
        transcript_ht[context_ht.transcript].end_pos
        - transcript_ht[context_ht.transcript].start_pos
    )
    + 1,
)

This is doing four shuffle joins – one for each time that transcript_ht appears, I believe. You could reimplement this as:

context_ht = context_ht.annotate(tx=transcript_ht[context_ht.transcript])
context_ht = context_ht.annotate(start_pos=context_ht.tx.start_pos, ...)
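
Spelled out a bit more (just a sketch; the tx field name above is arbitrary), the remaining fields would all come from that single joined struct:

context_ht = context_ht.annotate(
    start_pos=context_ht.tx.start_pos,
    end_pos=context_ht.tx.end_pos,
    # Compute the size from the already-joined struct rather than re-joining transcript_ht.
    transcript_size=context_ht.tx.end_pos - context_ht.tx.start_pos + 1,
)
# Optionally drop the intermediate struct once the fields are annotated.
context_ht = context_ht.drop("tx")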


ooh good catch. I’ll try that, thank you!!

ah that doesn’t seem to have fixed it – will send the newest log

Hey @ch-kr!

We’re still digging into this. Could you share which arguments you supplied to the gnomad scripts? For example, is args.pre_process_data enabled?

Also, do you know if the {temp_path}/transcript.ht is successfully written? This is around line 380 of regional_constraint.py.

thank you for looking into this!!

this is the command I used:

hailctl dataproc submit kc pipeline/regional_constraint.py --search-for-simul-breaks --slack-channel "@kc (she/her)" --chisq-threshold 6.6

re: transcript HT: yup! I actually wrote the HT in June, here’s the schema:

----------------------------------------
File Type: Table
    Partitions: 57735
    Rows: 57735
    Empty partitions: 0
    Min(rows/partition): 1
    Max(rows/partition): 1
    Median(rows/partition): 1
    Mean(rows/partition): 1
    StdDev(rows/partition): 0
----------------------------------------
Global fields:

----------------------------------------
Row fields:
    'transcript': str
    'end_pos': int32
    'start_pos': int32
----------------------------------------
Key: ['transcript']
----------------------------------------

Great. And I assume you’re stuck trying to write the checkpoint file: {temp_path}/simul_breaks.ht, yeah?

EDIT:

Seems like you’d actually be stuck earlier at: f"{temp_path}/simul_break_{window_size}_temp.ht"

Actually, @ch-kr , do you have the output log from this job? I’m trying to narrow down which part of the Python code corresponds to your issue. It’d also be helpful to know which temp tables were created and which were not, if you still have that information.

yup! I’m running it now (actually, sorry, it just crashed), and it seems to be failing here: https://github.com/broadinstitute/regional_missense_constraint/blob/simul_fix/rmc/utils/constraint.py#L1378. Attached: output.txt (58.1 KB)

Cluster: Google Cloud Platform
Job page: Google Cloud Platform

OK, just to confirm, that output.txt seems to suggest you fail on this checkpoint. Does that agree with your understanding? And also just confirming: the code you’re running is the same as the code pushed to GitHub in the branch simul_fix?

Here’s a theory. In constraint.py, lines 637-638:

group_ht = ht.group_by(search_field).aggregate(max_chisq=hl.agg.max(ht.chisq))
ht = ht.annotate(max_chisq=group_ht[ht.transcript].max_chisq)

Hail implements this as three shuffles. The first shuffle computes the max chisq statistic per transcript group. The second shuffle builds a Table that maps from transcript to (locus, alleles). The third shuffle joins the per-transcript table back onto the per-locus-and-alleles table.

Can you try adding

group_ht = group_ht.checkpoint(...)

after line 637 (the group_by line)?

I think that will alleviate some of the pressure on the Spark workers during these shuffles by splitting them into two separate Spark runs.
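
As a rough sketch of that pattern (the temp table path here is a placeholder, and search_field is assumed to be the transcript key, as above):

group_ht = ht.group_by(search_field).aggregate(max_chisq=hl.agg.max(ht.chisq))
# Writing out the small per-transcript table forces the aggregation shuffle to
# finish in its own Spark job before the join back onto ht starts another one.
group_ht = group_ht.checkpoint(f"{temp_path}/max_chisq_by_transcript_temp.ht", overwrite=True)
ht = ht.annotate(max_chisq=group_ht[ht.transcript].max_chisq)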


I wanted to send a quick update that I just tried running this code again (dropping more annotations before checkpointing – I had a drop here https://github.com/broadinstitute/regional_missense_constraint/blob/simul_fix/rmc/utils/constraint.py#L1373 but changed it to the select that is currently on that branch of the repo), and I finally managed to checkpoint the table!

thanks also for the new suggestion! I’ll take a closer look (just noticed it while typing this reply) and also try that out. also, for your first two questions: yes and yes (code was failing at that checkpoint, and I’m running the code on the simul_fix branch)


Awesome! That was another thing I noticed in search_for_break. It looks like you create a number of annotations that you use to calculate the chisq value. Then you group_by().aggregate(), which drops those annotations from group_ht, but they still exist on ht. It might be valuable, in general, to drop those annotations from ht before the annotate on line 638.
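
Something along these lines (just a sketch; the field names below are hypothetical stand-ins for whatever intermediate annotations feed into chisq):

# Drop intermediate fields that were only needed to compute chisq so they are
# not carried through the shuffle when max_chisq is joined back onto ht.
ht = ht.drop("cumulative_obs", "cumulative_exp", "_mu_scan")
ht = ht.annotate(max_chisq=group_ht[ht.transcript].max_chisq)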


great point about the annotations – I’ll take another look through the code to clean it up a bit more.

thank you for all of the help digging into this issue!! I really appreciate the tips about which parts of the code trigger shuffles and could benefit from checkpoints.

hi again, and thanks for the tips from before! I’m running into a new issue with the same code: I’m running this function (https://github.com/broadinstitute/regional_missense_constraint/blob/simul_fix/rmc/utils/constraint.py#L1229), but it seems to be crashing at this checkpoint (https://github.com/broadinstitute/regional_missense_constraint/blob/simul_fix/rmc/utils/constraint.py#L1384).

Output:

Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.75-52791e9868e4
LOGGING: writing to /RMC.log
INFO (regional_missense_constraint 356): Searching for two simultaneous breaks in transcripts that didn't have a single significant break...
INFO (regional_missense_constraint 376): Getting start and end positions and total size for each transcript...
INFO (regional_missense_constraint_generic 247): Total number of bases in the exome: 54426835
INFO (regional_missense_constraint_generic 248): Total number of missense variants in gnomAD exomes: 5257859
INFO (regional_missense_constraint_generic 251): Getting average bases between missense variants and returning...
INFO (regional_missense_constraint 422): Minimum window size (window size needed to observe 10 missense variants on average): 100
INFO (regional_missense_constraint 428): Searching for transcripts with simultaneous breaks...
INFO (constraint_utils 1273): Getting smallest window end and post-window positions...
INFO (constraint_utils 1274): Also getting maximum simultaneous break size (0.900000 * largest transcript size)...
INFO (constraint_utils 1009): Annotating each transcript with max window size...
2021-09-13 16:46:33 Hail: INFO: Ordering unsorted dataset with network shuffle
INFO (constraint_utils 1012): Maximum window size: 2083174
INFO (constraint_utils 1059): HT count: 22750897
INFO (constraint_utils 1290): Checking all possible window sizes...
----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'transcript': str
    'mu_snp': float64
    'total_exp': float64
    '_mu_scan': dict<str, float64>
    'total_mu': float64
    'cumulative_obs': dict<str, int64>
    'observed': int32
    'cumulative_exp': float64
    'total_obs': int64
    'reverse': struct {
        obs: int64,
        exp: float64
    }
    'overall_oe': float64
    'post_window_pos': int32
    'window_end': int32
----------------------------------------
Key: ['locus', 'transcript']
----------------------------------------
INFO (constraint_utils 1382): Window size: 101
INFO (constraint_utils 1383): Checkpointing first temp ht...
INFO (regional_missense_constraint 687): Copying hail log to logging bucket...
2021-09-13 16:52:58 Hail: INFO: copying log to 'gs://regional_missense_constraint/logs/RMC.log'...
Traceback (most recent call last):
  File "/tmp/19569c6f88d54662afe2c56ab0ee51fb/regional_constraint.py", line 807, in <module>
    main(args)
  File "/tmp/19569c6f88d54662afe2c56ab0ee51fb/regional_constraint.py", line 429, in main
    context_ht = search_two_break_windows(
  File "/tmp/19569c6f88d54662afe2c56ab0ee51fb/pyscripts_1j7_3mzj.zip/rmc/utils/constraint.py", line 1384, in search_two_break_windows
  File "<decorator-gen-1119>", line 2, in checkpoint
  File "/opt/conda/default/lib/python3.8/site-packages/hail/typecheck/check.py", line 577, in wrapper
    return __original_func(*args_, **kwargs_)
  File "/opt/conda/default/lib/python3.8/site-packages/hail/table.py", line 1238, in checkpoint
    self.write(output=output, overwrite=overwrite, stage_locally=stage_locally, _codec_spec=_codec_spec)
  File "<decorator-gen-1121>", line 2, in write
  File "/opt/conda/default/lib/python3.8/site-packages/hail/typecheck/check.py", line 577, in wrapper
    return __original_func(*args_, **kwargs_)
  File "/opt/conda/default/lib/python3.8/site-packages/hail/table.py", line 1271, in write
    Env.backend().execute(ir.TableWrite(self._tir, ir.TableNativeWriter(output, overwrite, stage_locally, _codec_spec)))
  File "/opt/conda/default/lib/python3.8/site-packages/hail/backend/py4j_backend.py", line 98, in execute
    raise e
  File "/opt/conda/default/lib/python3.8/site-packages/hail/backend/py4j_backend.py", line 74, in execute
    result = json.loads(self._jhc.backend().executeJSON(jir))
  File "/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
  File "/opt/conda/default/lib/python3.8/site-packages/hail/backend/py4j_backend.py", line 30, in deco
    raise FatalError('%s\n\nJava stack trace:\n%s\n'
hail.utils.java.FatalError: MethodTooLargeException: Method too large: __C514Compiled.__m516split_Let_region6_8052 (L__C9213__m516split_LetSpills;)V

Java stack trace:
is.hail.relocated.org.objectweb.asm.MethodTooLargeException: Method too large: __C514Compiled.__m516split_Let_region6_8052 (L__C9213__m516split_LetSpills;)V
	at is.hail.relocated.org.objectweb.asm.MethodWriter.computeMethodInfoSize(MethodWriter.java:2087)
	at is.hail.relocated.org.objectweb.asm.ClassWriter.toByteArray(ClassWriter.java:489)
	at is.hail.lir.Emit$.apply(Emit.scala:217)
	at is.hail.lir.Classx.$anonfun$asBytes$4(X.scala:110)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at scala.collection.AbstractIterator.to(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1431)
	at is.hail.lir.Classx.asBytes(X.scala:123)
	at is.hail.asm4s.ClassBuilder.classBytes(ClassBuilder.scala:351)
	at is.hail.asm4s.ModuleBuilder.$anonfun$classesBytes$1(ClassBuilder.scala:152)
	at is.hail.asm4s.ModuleBuilder.$anonfun$classesBytes$1$adapted(ClassBuilder.scala:152)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at scala.collection.AbstractIterator.to(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1431)
	at is.hail.asm4s.ModuleBuilder.classesBytes(ClassBuilder.scala:153)
	at is.hail.expr.ir.EmitClassBuilder.resultWithIndex(EmitClassBuilder.scala:660)
	at is.hail.expr.ir.WrappedEmitClassBuilder.resultWithIndex(EmitClassBuilder.scala:154)
	at is.hail.expr.ir.WrappedEmitClassBuilder.resultWithIndex$(EmitClassBuilder.scala:154)
	at is.hail.expr.ir.EmitFunctionBuilder.resultWithIndex(EmitClassBuilder.scala:1052)
	at is.hail.expr.ir.Compile$.apply(Compile.scala:78)
	at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$1(CompileAndEvaluate.scala:49)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:49)
	at is.hail.expr.ir.CompileAndEvaluate$.evalToIR(CompileAndEvaluate.scala:29)
	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:29)
	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.rewrite$1(LowerOrInterpretNonCompilable.scala:66)
	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:71)
	at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:68)
	at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$3(LoweringPass.scala:15)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$1(LoweringPass.scala:15)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.lowering.LoweringPass.apply(LoweringPass.scala:13)
	at is.hail.expr.ir.lowering.LoweringPass.apply$(LoweringPass.scala:12)
	at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.apply(LoweringPass.scala:63)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1(LoweringPipeline.scala:14)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1$adapted(LoweringPipeline.scala:12)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:12)
	at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:46)
	at is.hail.backend.spark.SparkBackend._execute(SparkBackend.scala:381)
	at is.hail.backend.spark.SparkBackend.$anonfun$execute$1(SparkBackend.scala:365)
	at is.hail.expr.ir.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:47)
	at is.hail.utils.package$.using(package.scala:638)
	at is.hail.expr.ir.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:47)
	at is.hail.utils.package$.using(package.scala:638)
	at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
	at is.hail.expr.ir.ExecuteContext$.scoped(ExecuteContext.scala:46)
	at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:275)
	at is.hail.backend.spark.SparkBackend.execute(SparkBackend.scala:362)
	at is.hail.backend.spark.SparkBackend.$anonfun$executeJSON$1(SparkBackend.scala:406)
	at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
	at is.hail.backend.spark.SparkBackend.executeJSON(SparkBackend.scala:404)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



Hail version: 0.2.75-52791e9868e4
Error summary: MethodTooLargeException: Method too large: __C514Compiled.__m516split_Let_region6_8052 (L__C9213__m516split_LetSpills;)V
ERROR: (gcloud.dataproc.jobs.submit.pyspark) Job [19569c6f88d54662afe2c56ab0ee51fb] failed with error:
Google Cloud Dataproc Agent reports job failure. If logs are available, they can be found at:
https://console.cloud.google.com/dataproc/jobs/19569c6f88d54662afe2c56ab0ee51fb?project=broad-mpg-gnomad&region=us-central1
gcloud dataproc jobs wait '19569c6f88d54662afe2c56ab0ee51fb' --region 'us-central1' --project 'broad-mpg-gnomad'
https://console.cloud.google.com/storage/browser/dataproc-faa46220-ec08-4f5b-92bd-9722e1963047-us-central1/google-cloud-dataproc-metainfo/ca21d0ac-2117-4224-a272-a36d7774a17a/jobs/19569c6f88d54662afe2c56ab0ee51fb/
gs://dataproc-faa46220-ec08-4f5b-92bd-9722e1963047-us-central1/google-cloud-dataproc-metainfo/ca21d0ac-2117-4224-a272-a36d7774a17a/jobs/19569c6f88d54662afe2c56ab0ee51fb/driveroutput
Traceback (most recent call last):
  File "/Users/kchao/anaconda3/envs/hail/bin/hailctl", line 8, in <module>
    sys.exit(main())
  File "/Users/kchao/anaconda3/envs/hail/lib/python3.7/site-packages/hailtop/hailctl/__main__.py", line 100, in main
    cli.main(args)
  File "/Users/kchao/anaconda3/envs/hail/lib/python3.7/site-packages/hailtop/hailctl/dataproc/cli.py", line 122, in main
    jmp[args.module].main(args, pass_through_args)
  File "/Users/kchao/anaconda3/envs/hail/lib/python3.7/site-packages/hailtop/hailctl/dataproc/submit.py", line 78, in main
    gcloud.run(cmd)
  File "/Users/kchao/anaconda3/envs/hail/lib/python3.7/site-packages/hailtop/hailctl/dataproc/gcloud.py", line 9, in run
    return subprocess.check_call(["gcloud"] + command)
  File "/Users/kchao/anaconda3/envs/hail/lib/python3.7/subprocess.py", line 328, in check_call
    raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['gcloud', 'dataproc', 'jobs', 'submit', 'pyspark', 'pipeline/regional_constraint.py', '--cluster=kc', '--files=', '--py-files=/var/folders/xq/8jnhrt2s2h58ts2v0br5g8gm0000gp/T/pyscripts_1j7_3mzj.zip', '--properties=', '--', '--search-for-simul-breaks', '--slack-channel', '@kc (she/her)', '--chisq-threshold', '6.6']' returned non-zero exit status 1.

I’ll send the log via email. I don’t think there are any other fields I can drop from that table – is there something else I could try to bypass this new error?

Does this pipeline include the part_min_and_max code we spoke about in the other thread?