Hi, I’m new to Hail and I’ve run into a file-not-found error that only occurs under specific circumstances.
I’m running the GWAS tutorial, and everything runs as expected in local mode. When running on a cluster, however, hl.plot.qq fails with the error below, while everything else in the tutorial runs fine. (I did have to modify the data paths for my Spark cluster setup, so that I read from and write to directories that are accessible to my executors.) The issue appears to stem from an intermediate file being written to a /tmp location that is not accessible to the executors/workers: the error says /tmp/table-map-rows-scan-aggs-part-iYxclUO3l4CC05U3JTxZKL does not exist, but that file does exist on my local filesystem. Has anyone else encountered this, and if so, do you have any suggestions? Is there a way to change the scratch path those files get written to, so that I can choose one that is also accessible to the executors?
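For context, here is roughly the kind of configuration hook I’m hoping exists. This is just a sketch on my part: I’m guessing that tmp_dir/local_tmpdir are the relevant hl.init() options, and the HDFS path is a made-up placeholder for whatever shared filesystem the executors can actually reach.

```python
import hail as hl

# Sketch: point Hail's scratch space at storage the executors can see,
# instead of the driver-local /tmp. Both the option names and the paths
# here are my guesses, not verified behavior.
hl.init(
    tmp_dir='hdfs:///tmp/hail',        # network-visible scratch dir (placeholder path)
    local_tmpdir='file:///local/tmp',  # node-local scratch, if that's configured separately
)
```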
Note: I thought a way around this might be to pass collect_all=True in the qq plot options, or to collect the GWAS p-values before plotting, but collect_all gave me a similar error, and plotting after collecting didn’t resolve it either (I got different errors). See the sketch after this paragraph for roughly what I tried.
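For reference, this is approximately what those attempts looked like (paraphrased from memory, so the exact calls may not match what I ran):

```python
import hail as hl

# Attempt 1: collect every point instead of downsampling --
# this failed with a similar FileNotFoundException on the /tmp scratch file.
p = hl.plot.qq(gwas.p_value, collect_all=True)

# Attempt 2: pull the p-values to the driver first, then plot --
# this avoided the scratch file but failed for me with different errors.
pvals = gwas.p_value.collect()
p = hl.plot.qq(hl.literal(pvals))  # possibly not even the right way to pass them back in
```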
Thanks in advance! Copying the full trace below.
2021-07-24 00:56:41 Hail: INFO: Ordering unsorted dataset with network shuffle
FatalError Traceback (most recent call last)
&lt;ipython-input-...&gt; in &lt;module&gt;
----> 1 p = hl.plot.qq(gwas.p_value)
2 show(p)
&lt;decorator-gen-...&gt; in qq(pvals, label, title, xlabel, ylabel, size, legend, hover_fields, colors, width, height, collect_all, n_divisions, missing_label)
/opt/conda/lib/python3.8/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
575 def wrapper(__original_func, *args, **kwargs):
576 args, kwargs = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 577 return __original_func(*args, **kwargs)
578
579 return wrapper
/opt/conda/lib/python3.8/site-packages/hail/plot/plots.py in qq(pvals, label, title, xlabel, ylabel, size, legend, hover_fields, colors, width, height, collect_all, n_divisions, missing_label)
1321 if 'p' not in hover_fields:
1322 hover_fields['p_value'] = ht['p_value']
--> 1323 p = scatter(
1324 ht.expected_p,
1325 ht.observed_p,
&lt;decorator-gen-...&gt; in scatter(x, y, label, title, xlabel, ylabel, size, legend, hover_fields, colors, width, height, collect_all, n_divisions, missing_label)
/opt/conda/lib/python3.8/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
575 def wrapper(__original_func, *args, **kwargs):
576 args, kwargs = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 577 return __original_func(*args, **kwargs)
578
579 return wrapper
/opt/conda/lib/python3.8/site-packages/hail/plot/plots.py in scatter(x, y, label, title, xlabel, ylabel, size, legend, hover_fields, colors, width, height, collect_all, n_divisions, missing_label)
943 y = ('y', y)
944
--> 945 source_pd = _collect_scatter_plot_data(x, y, fields={**hover_fields, **label}, n_divisions=None if collect_all else n_divisions, missing_label=missing_label)
946 sp = figure(title=title, x_axis_label=xlabel, y_axis_label=ylabel, height=height, width=width)
947 sp, sp_legend_items, sp_legend, sp_color_bar, sp_color_mappers, sp_scatter_renderers = _get_scatter_plot_elements(sp, source_pd, x[0], y[0], label_cols, colors, size)
/opt/conda/lib/python3.8/site-packages/hail/plot/plots.py in _collect_scatter_plot_data(x, y, fields, n_divisions, missing_label)
741
742 agg_f = x[1]._aggregation_method()
--> 743 res = agg_f(hail.agg.downsample(x[1], y[1], label=list(expressions.values()) if expressions else None, n_divisions=n_divisions))
744 source_pd = pd.DataFrame([
745 dict(
&lt;decorator-gen-...&gt; in aggregate(self, expr, _localize)
/opt/conda/lib/python3.8/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
575 def wrapper(__original_func, *args, **kwargs):
576 args, kwargs = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 577 return __original_func(*args, **kwargs)
578
579 return wrapper
/opt/conda/lib/python3.8/site-packages/hail/table.py in aggregate(self, expr, _localize)
1176
1177 if _localize:
--> 1178 return Env.backend().execute(agg_ir)
1179
1180 return construct_expr(ir.LiftMeOut(agg_ir), expr.dtype)
/opt/conda/lib/python3.8/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
96 raise HailUserError(message_and_trace) from None
97
---> 98 raise e
/opt/conda/lib/python3.8/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
72 # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))
73 try:
---> 74 result = json.loads(self._jhc.backend().executeJSON(jir))
75 value = ir.typ._from_json(result['value'])
76 timings = result['timings']
/opt/conda/lib/python3.8/site-packages/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
--> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/opt/conda/lib/python3.8/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
28 raise FatalError('Error summary: %s' % (deepest,), error_id) from None
29 else:
---> 30 raise FatalError('%s\n\nJava stack trace:\n%s\n'
31 'Hail version: %s\n'
32 'Error summary: %s' % (deepest, full, hail.version, deepest), error_id) from None
FatalError: FileNotFoundException: File /tmp/table-map-rows-scan-aggs-part-iYxclUO3l4CC05U3JTxZKL does not exist
Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 164.0 failed 4 times, most recent failure: Lost task 0.3 in stage 164.0 (TID 271) (100.64.88.33 executor 0): java.io.FileNotFoundException: File /tmp/table-map-rows-scan-aggs-part-iYxclUO3l4CC05U3JTxZKL does not exist
at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:666)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:987)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:656)
at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:454)
at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.&lt;init&gt;(ChecksumFileSystem.java:146)
at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:347)
at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:899)
at is.hail.io.fs.HadoopFS.openNoCompression(HadoopFS.scala:83)
at is.hail.expr.ir.TableMapRows.$anonfun$execute$66(TableIR.scala:2161)
at is.hail.expr.ir.TableMapRows.$anonfun$execute$66$adapted(TableIR.scala:2155)
at is.hail.sparkextras.ContextRDD.$anonfun$cmapPartitionsWithIndexAndValue$2(ContextRDD.scala:270)
at is.hail.sparkextras.ContextRDD.$anonfun$cmapPartitions$3(ContextRDD.scala:218)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
at is.hail.utils.richUtils.RichContextRDD$$anon$1.hasNext(RichContextRDD.scala:71)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:221)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)
at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1423)
at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1350)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1414)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1237)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:384)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:335)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2297)
at is.hail.rvd.RVD.combine(RVD.scala:725)
at is.hail.expr.ir.Interpret$.run(Interpret.scala:913)
at is.hail.expr.ir.Interpret$.alreadyLowered(Interpret.scala:56)
at is.hail.expr.ir.InterpretNonCompilable$.interpretAndCoerce$1(InterpretNonCompilable.scala:16)
at is.hail.expr.ir.InterpretNonCompilable$.rewrite$1(InterpretNonCompilable.scala:53)
at is.hail.expr.ir.InterpretNonCompilable$.apply(InterpretNonCompilable.scala:58)
at is.hail.expr.ir.lowering.InterpretNonCompilablePass$.transform(LoweringPass.scala:67)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$3(LoweringPass.scala:15)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$1(LoweringPass.scala:15)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass.apply(LoweringPass.scala:13)
at is.hail.expr.ir.lowering.LoweringPass.apply$(LoweringPass.scala:12)
at is.hail.expr.ir.lowering.InterpretNonCompilablePass$.apply(LoweringPass.scala:62)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1(LoweringPipeline.scala:14)
at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1$adapted(LoweringPipeline.scala:12)
at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:12)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:29)
at is.hail.backend.spark.SparkBackend._execute(SparkBackend.scala:381)
at is.hail.backend.spark.SparkBackend.$anonfun$execute$1(SparkBackend.scala:365)
at is.hail.expr.ir.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:47)
at is.hail.utils.package$.using(package.scala:627)
at is.hail.expr.ir.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:47)
at is.hail.utils.package$.using(package.scala:627)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
at is.hail.expr.ir.ExecuteContext$.scoped(ExecuteContext.scala:46)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:275)
at is.hail.backend.spark.SparkBackend.execute(SparkBackend.scala:362)
at is.hail.backend.spark.SparkBackend.$anonfun$executeJSON$1(SparkBackend.scala:406)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.backend.spark.SparkBackend.executeJSON(SparkBackend.scala:404)
at sun.reflect.GeneratedMethodAccessor41.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
java.io.FileNotFoundException: File /tmp/table-map-rows-scan-aggs-part-iYxclUO3l4CC05U3JTxZKL does not exist
at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:666)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:987)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:656)
at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:454)
at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.&lt;init&gt;(ChecksumFileSystem.java:146)
at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:347)
at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:899)
at is.hail.io.fs.HadoopFS.openNoCompression(HadoopFS.scala:83)
at is.hail.expr.ir.TableMapRows.$anonfun$execute$66(TableIR.scala:2161)
at is.hail.expr.ir.TableMapRows.$anonfun$execute$66$adapted(TableIR.scala:2155)
at is.hail.sparkextras.ContextRDD.$anonfun$cmapPartitionsWithIndexAndValue$2(ContextRDD.scala:270)
at is.hail.sparkextras.ContextRDD.$anonfun$cmapPartitions$3(ContextRDD.scala:218)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
at is.hail.utils.richUtils.RichContextRDD$$anon$1.hasNext(RichContextRDD.scala:71)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:221)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)
at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1423)
at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1350)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1414)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1237)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:384)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:335)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Hail version: 0.2.73-2ab84582d24d
Error summary: FileNotFoundException: File /tmp/table-map-rows-scan-aggs-part-iYxclUO3l4CC05U3JTxZKL does not exist