Editor’s Note:
The Hail team does not recommend the solution posted here; please read the entire thread for details and possible alternatives.
~ @danking
This may be an issue with the UKB RAP, but I cannot tell. In the simplest case, I am just trying to write and then read back a MatrixTable:
import pyspark
import dxpy
import subprocess
import hail as hl

# Spark and Hail set-up on the RAP Jupyter cluster
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
hl.init(sc=sc, default_reference='GRCh38')

# Single pVCF block containing the region of interest
vcf_file = ['file:///mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format/ukb23156_c19_b46_v1.vcf.gz']

# Restrict the import to the region of interest on chr19
region = [hl.parse_locus_interval("[chr19:45668221-chr19:45683722]")]

mts = hl.import_gvcfs(
    vcf_file,
    partitions=region,
    reference_genome="GRCh38",
    array_elements_required=False,
)
mt = mts[0]
print(mt.count())

# Write the MatrixTable locally, then push the whole directory back to the project
mt.write("file:/opt/notebooks/GIPR.mt")
subprocess.run(["dx", "upload", "/opt/notebooks/GIPR.mt", "-r", "--path", "/"], check=True, shell=False)
pip-installed Hail requires additional configuration options in Spark referring
to the path to the Hail Python module directory HAIL_DIR,
e.g. /path/to/python/site-packages/hail:
spark.jars=HAIL_DIR/hail-all-spark.jar
spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
spark.executor.extraClassPath=./hail-all-spark.jar
Running on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-47-217.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /opt/notebooks/gogoGPCR/notebooks/hail-20210920-1148-0.2.61-3c86d3ba497a.log
(828, 200643)
2021-09-20 11:56:29 Hail: INFO: wrote matrix table with 828 rows and 200643 columns in 1 partition to file:/opt/notebooks/GIPR.mt
Total size: 452.53 MiB
* Rows/entries: 451.40 MiB
* Columns: 1.13 MiB
* Globals: 11.00 B
* Smallest partition: 828 rows (451.40 MiB)
* Largest partition: 828 rows (451.40 MiB)
CompletedProcess(args=['dx', 'upload', '/opt/notebooks/GIPR.mt', '-r', '--path', '/'], returncode=0)
I have attached the logs for writing the MatrixTable and for reading it back in the same environment. Trying to read the same MatrixTable in a different environment gives:
# Pull the uploaded MatrixTable back down from the project and read it locally
dxpy.download_folder("project-xxx", "/opt/notebooks/GIPR.mt", "/GIPR.mt")
mt = hl.read_matrix_table("file:/opt/notebooks/GIPR.mt")
mt.show()
---------------------------------------------------------------------------
FatalError Traceback (most recent call last)
/opt/conda/lib/python3.6/site-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
/opt/conda/lib/python3.6/site-packages/IPython/lib/pretty.py in pretty(self, obj)
392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
395
396 return _default_pprint(obj, self, cycle)
/opt/conda/lib/python3.6/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
698 """A pprint that just redirects to the normal repr function."""
699 # Find newlines and replace them with p.break_()
--> 700 output = repr(obj)
701 lines = output.splitlines()
702 with p.group():
/opt/conda/lib/python3.6/site-packages/hail/matrixtable.py in __repr__(self)
2541
2542 def __repr__(self):
-> 2543 return self.__str__()
2544
2545 def _repr_html_(self):
/opt/conda/lib/python3.6/site-packages/hail/matrixtable.py in __str__(self)
2535
2536 def __str__(self):
-> 2537 s = self.table_show.__str__()
2538 if self.displayed_n_cols != self.actual_n_cols:
2539 s += f"showing the first { self.displayed_n_cols } of { self.actual_n_cols } columns"
/opt/conda/lib/python3.6/site-packages/hail/table.py in __str__(self)
1292
1293 def __str__(self):
-> 1294 return self._ascii_str()
1295
1296 def __repr__(self):
/opt/conda/lib/python3.6/site-packages/hail/table.py in _ascii_str(self)
1318 return s
1319
-> 1320 rows, has_more, dtype = self.data()
1321 fields = list(dtype)
1322 trunc_fields = [trunc(f) for f in fields]
/opt/conda/lib/python3.6/site-packages/hail/table.py in data(self)
1302 row_dtype = t.row.dtype
1303 t = t.select(**{k: hl._showstr(v) for (k, v) in t.row.items()})
-> 1304 rows, has_more = t._take_n(self.n)
1305 self._data = (rows, has_more, row_dtype)
1306 return self._data
/opt/conda/lib/python3.6/site-packages/hail/table.py in _take_n(self, n)
1449 has_more = False
1450 else:
-> 1451 rows = self.take(n + 1)
1452 has_more = len(rows) > n
1453 rows = rows[:n]
<decorator-gen-1119> in take(self, n, _localize)
/opt/conda/lib/python3.6/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
612 def wrapper(__original_func, *args, **kwargs):
613 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 614 return __original_func(*args_, **kwargs_)
615
616 return wrapper
/opt/conda/lib/python3.6/site-packages/hail/table.py in take(self, n, _localize)
2119 """
2120
-> 2121 return self.head(n).collect(_localize)
2122
2123 @typecheck_method(n=int)
<decorator-gen-1113> in collect(self, _localize)
/opt/conda/lib/python3.6/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
612 def wrapper(__original_func, *args, **kwargs):
613 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 614 return __original_func(*args_, **kwargs_)
615
616 return wrapper
/opt/conda/lib/python3.6/site-packages/hail/table.py in collect(self, _localize)
1918 e = construct_expr(rows_ir, hl.tarray(t.row.dtype))
1919 if _localize:
-> 1920 return Env.backend().execute(e._ir)
1921 else:
1922 return e
/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
96 raise HailUserError(message_and_trace) from None
97
---> 98 raise e
/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
72 # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))
73 try:
---> 74 result = json.loads(self._jhc.backend().executeJSON(jir))
75 value = ir.typ._from_json(result['value'])
76 timings = result['timings']
/cluster/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
30 raise FatalError('%s\n\nJava stack trace:\n%s\n'
31 'Hail version: %s\n'
---> 32 'Error summary: %s' % (deepest, full, hail.__version__, deepest), error_id) from None
33 except pyspark.sql.utils.CapturedException as e:
34 raise FatalError('%s\n\nJava stack trace:\n%s\n'
FatalError: FileNotFoundException: File file:/opt/notebooks/GIPR.mt/rows/rows/parts/part-0-2-0-0-ba507024-1211-ab56-179a-832d6e98beb7 does not exist
Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, ip-10-60-2-89.eu-west-2.compute.internal, executor 0): java.io.FileNotFoundException: File file:/opt/notebooks/GIPR.mt/rows/rows/parts/part-0-2-0-0-ba507024-1211-ab56-179a-832d6e98beb7 does not exist
at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:611)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:824)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:601)
at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:421)
at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:142)
at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:346)
at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:769)
at is.hail.io.fs.HadoopFS.openNoCompression(HadoopFS.scala:83)
at is.hail.io.fs.FS$class.open(FS.scala:139)
at is.hail.io.fs.HadoopFS.open(HadoopFS.scala:70)
at is.hail.io.fs.FS$class.open(FS.scala:148)
at is.hail.io.fs.HadoopFS.open(HadoopFS.scala:70)
at is.hail.HailContext$$anon$1.compute(HailContext.scala:276)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2001)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1984)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1983)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1983)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1033)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1033)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1033)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2223)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2172)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2161)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:823)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at is.hail.sparkextras.ContextRDD.runJob(ContextRDD.scala:351)
at is.hail.rvd.RVD$$anonfun$13.apply(RVD.scala:526)
at is.hail.rvd.RVD$$anonfun$13.apply(RVD.scala:526)
at is.hail.utils.PartitionCounts$.incrementalPCSubsetOffset(PartitionCounts.scala:73)
at is.hail.rvd.RVD.head(RVD.scala:525)
at is.hail.expr.ir.TableSubset$class.execute(TableIR.scala:1326)
at is.hail.expr.ir.TableHead.execute(TableIR.scala:1332)
at is.hail.expr.ir.TableMapRows.execute(TableIR.scala:1845)
at is.hail.expr.ir.Interpret$.run(Interpret.scala:819)
at is.hail.expr.ir.Interpret$.alreadyLowered(Interpret.scala:53)
at is.hail.expr.ir.InterpretNonCompilable$.interpretAndCoerce$1(InterpretNonCompilable.scala:16)
at is.hail.expr.ir.InterpretNonCompilable$.is$hail$expr$ir$InterpretNonCompilable$$rewrite$1(InterpretNonCompilable.scala:53)
at is.hail.expr.ir.InterpretNonCompilable$$anonfun$1.apply(InterpretNonCompilable.scala:25)
at is.hail.expr.ir.InterpretNonCompilable$$anonfun$1.apply(InterpretNonCompilable.scala:25)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at is.hail.expr.ir.InterpretNonCompilable$.rewriteChildren$1(InterpretNonCompilable.scala:25)
at is.hail.expr.ir.InterpretNonCompilable$.is$hail$expr$ir$InterpretNonCompilable$$rewrite$1(InterpretNonCompilable.scala:54)
at is.hail.expr.ir.InterpretNonCompilable$.apply(InterpretNonCompilable.scala:58)
at is.hail.expr.ir.lowering.InterpretNonCompilablePass$.transform(LoweringPass.scala:67)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:15)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:15)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:15)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:13)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass$class.apply(LoweringPass.scala:13)
at is.hail.expr.ir.lowering.InterpretNonCompilablePass$.apply(LoweringPass.scala:62)
at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:14)
at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:12)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:12)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:28)
at is.hail.backend.spark.SparkBackend.is$hail$backend$spark$SparkBackend$$_execute(SparkBackend.scala:354)
at is.hail.backend.spark.SparkBackend$$anonfun$execute$1.apply(SparkBackend.scala:338)
at is.hail.backend.spark.SparkBackend$$anonfun$execute$1.apply(SparkBackend.scala:335)
at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:25)
at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:23)
at is.hail.utils.package$.using(package.scala:618)
at is.hail.annotations.Region$.scoped(Region.scala:18)
at is.hail.expr.ir.ExecuteContext$.scoped(ExecuteContext.scala:23)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:247)
at is.hail.backend.spark.SparkBackend.execute(SparkBackend.scala:335)
at is.hail.backend.spark.SparkBackend$$anonfun$7.apply(SparkBackend.scala:379)
at is.hail.backend.spark.SparkBackend$$anonfun$7.apply(SparkBackend.scala:377)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.backend.spark.SparkBackend.executeJSON(SparkBackend.scala:377)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
java.io.FileNotFoundException: File file:/opt/notebooks/GIPR.mt/rows/rows/parts/part-0-2-0-0-ba507024-1211-ab56-179a-832d6e98beb7 does not exist
at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:611)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:824)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:601)
at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:421)
at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:142)
at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:346)
at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:769)
at is.hail.io.fs.HadoopFS.openNoCompression(HadoopFS.scala:83)
at is.hail.io.fs.FS$class.open(FS.scala:139)
at is.hail.io.fs.HadoopFS.open(HadoopFS.scala:70)
at is.hail.io.fs.FS$class.open(FS.scala:148)
at is.hail.io.fs.HadoopFS.open(HadoopFS.scala:70)
at is.hail.HailContext$$anon$1.compute(HailContext.scala:276)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Hail version: 0.2.61-3c86d3ba497a
Error summary: FileNotFoundException: File file:/opt/notebooks/GIPR.mt/rows/rows/parts/part-0-2-0-0-ba507024-1211-ab56-179a-832d6e98beb7 does not exist
---------------------------------------------------------------------------
FatalError Traceback (most recent call last)
/opt/conda/lib/python3.6/site-packages/IPython/core/formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
/opt/conda/lib/python3.6/site-packages/hail/matrixtable.py in _repr_html_(self)
2544
2545 def _repr_html_(self):
-> 2546 s = self.table_show._repr_html_()
2547 if self.displayed_n_cols != self.actual_n_cols:
2548 s += '<p style="background: #fdd; padding: 0.4em;">'
/opt/conda/lib/python3.6/site-packages/hail/table.py in _repr_html_(self)
1307
1308 def _repr_html_(self):
-> 1309 return self._html_str()
1310
1311 def _ascii_str(self):
/opt/conda/lib/python3.6/site-packages/hail/table.py in _html_str(self)
1397 types = self.types
1398
-> 1399 rows, has_more, dtype = self.data()
1400 fields = list(dtype)
1401
/opt/conda/lib/python3.6/site-packages/hail/table.py in data(self)
1302 row_dtype = t.row.dtype
1303 t = t.select(**{k: hl._showstr(v) for (k, v) in t.row.items()})
-> 1304 rows, has_more = t._take_n(self.n)
1305 self._data = (rows, has_more, row_dtype)
1306 return self._data
/opt/conda/lib/python3.6/site-packages/hail/table.py in _take_n(self, n)
1449 has_more = False
1450 else:
-> 1451 rows = self.take(n + 1)
1452 has_more = len(rows) > n
1453 rows = rows[:n]
<decorator-gen-1119> in take(self, n, _localize)
/opt/conda/lib/python3.6/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
612 def wrapper(__original_func, *args, **kwargs):
613 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 614 return __original_func(*args_, **kwargs_)
615
616 return wrapper
/opt/conda/lib/python3.6/site-packages/hail/table.py in take(self, n, _localize)
2119 """
2120
-> 2121 return self.head(n).collect(_localize)
2122
2123 @typecheck_method(n=int)
<decorator-gen-1113> in collect(self, _localize)
/opt/conda/lib/python3.6/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
612 def wrapper(__original_func, *args, **kwargs):
613 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 614 return __original_func(*args_, **kwargs_)
615
616 return wrapper
/opt/conda/lib/python3.6/site-packages/hail/table.py in collect(self, _localize)
1918 e = construct_expr(rows_ir, hl.tarray(t.row.dtype))
1919 if _localize:
-> 1920 return Env.backend().execute(e._ir)
1921 else:
1922 return e
/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
96 raise HailUserError(message_and_trace) from None
97
---> 98 raise e
/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
72 # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))
73 try:
---> 74 result = json.loads(self._jhc.backend().executeJSON(jir))
75 value = ir.typ._from_json(result['value'])
76 timings = result['timings']
/cluster/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
30 raise FatalError('%s\n\nJava stack trace:\n%s\n'
31 'Hail version: %s\n'
---> 32 'Error summary: %s' % (deepest, full, hail.__version__, deepest), error_id) from None
33 except pyspark.sql.utils.CapturedException as e:
34 raise FatalError('%s\n\nJava stack trace:\n%s\n'
FatalError: FileNotFoundException: File file:/opt/notebooks/GIPR.mt/rows/rows/parts/part-0-2-0-0-ba507024-1211-ab56-179a-832d6e98beb7 does not exist
SHORTENED DUE TO CHARACTER LIMIT. LOG ATTACHED.
Hail version: 0.2.61-3c86d3ba497a
Error summary: FileNotFoundException: File file:/opt/notebooks/GIPR.mt/rows/rows/parts/part-0-2-0-0-ba507024-1211-ab56-179a-832d6e98beb7 does not exist
I cannot for the life of me figure out what is going on. It looks as though the rows/rows/parts/ folder ends up empty somehow. I hope this makes sense, but let me know if it does not.
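One check that might narrow this down (a rough sketch; it assumes the dx CLI is on the PATH and that the MatrixTable was uploaded to /GIPR.mt in the project, as in the write step above) is to compare what the project-side copy contains with what dxpy.download_folder actually fetched:

import subprocess
from pathlib import Path

# What does the project-side copy contain? dx ls -l prints names and sizes;
# the /GIPR.mt path matches the dx upload destination used above.
subprocess.run(["dx", "ls", "-l", "/GIPR.mt/rows/rows/parts/"], check=True)

# What did dxpy.download_folder actually fetch locally?
local_parts = Path("/opt/notebooks/GIPR.mt/rows/rows/parts")
if local_parts.exists():
    for p in sorted(local_parts.glob("*")):
        print(p.name, p.stat().st_size, "bytes")
else:
    print("no rows/rows/parts directory in the local download")

If the part file shows up in the project listing but not locally, the upload was fine and the download is the problem; if it is missing from the project as well, the dx upload -r step dropped it.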
Also, info on the Jupyter environment is here.
read_MatrixTable_same_environment.log (192.6 KB)
write_MatrixTable.log (178.0 KB)
read_matrixtable.log (197.2 KB)