I think I’m having the same problem where it’s an issue when reading multiple VCF files but not one. This may have been fixed by a newer version, I’m still using the default UKB RAP 0.2.78. When I try to run:
f_name_regex = 'ukb*_c19_b*_v1.vcf.gz'
mt = hl.import_vcf(
os.path.join(file_dir, f_name_regex),
force_bgz=True,
reference_genome="GRCh38",
array_elements_required=False
)
url = a_valid_url
mt.write(url, overwrite=True)
I get this error:
2023-01-16 02:25:52 Hail: INFO: Coerced sorted dataset
---------------------------------------------------------------------------
FatalError Traceback (most recent call last)
<ipython-input-10-1ce8858653fc> in <module>
6 # it later, the notebook needs to write it into a persistent filesystem (in this case DNAX).
7 # See https://hail.is/docs/0.2/hail.MatrixTable.html#hail.MatrixTable.write for additional documentation.
----> 8 mt.write(url, overwrite=True)
9 # Note: output should describe size of MT (i.e. number of rows, columns, partitions)
<decorator-gen-1275> in write(self, output, overwrite, stage_locally, _codec_spec, _partitions, _checkpoint_file)
/opt/conda/lib/python3.6/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
575 def wrapper(__original_func, *args, **kwargs):
576 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 577 return __original_func(*args_, **kwargs_)
578
579 return wrapper
/opt/conda/lib/python3.6/site-packages/hail/matrixtable.py in write(self, output, overwrite, stage_locally, _codec_spec, _partitions, _checkpoint_file)
2542
2543 writer = ir.MatrixNativeWriter(output, overwrite, stage_locally, _codec_spec, _partitions, _partitions_type, _checkpoint_file)
-> 2544 Env.backend().execute(ir.MatrixWrite(self._mir, writer))
2545
2546 class _Show:
/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
108 raise HailUserError(message_and_trace) from None
109
--> 110 raise e
/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
84 # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))
85 try:
---> 86 result_tuple = self._jhc.backend().executeEncode(jir, stream_codec)
87 (result, timings) = (result_tuple._1(), result_tuple._2())
88 value = ir.typ._from_encoding(result)
/cluster/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/opt/conda/lib/python3.6/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
29 raise FatalError('%s\n\nJava stack trace:\n%s\n'
30 'Hail version: %s\n'
---> 31 'Error summary: %s' % (deepest, full, hail.__version__, deepest), error_id) from None
32 except pyspark.sql.utils.CapturedException as e:
33 raise FatalError('%s\n\nJava stack trace:\n%s\n'
FatalError: FileNotFoundException: /mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/ukb23157_c19_b2_v1.vcf.gz (Input/output error)
Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 450 in stage 2.0 failed 4 times, most recent failure: Lost task 450.3 in stage 2.0 (TID 13932, ip-10-60-125-107.eu-west-2.compute.internal, executor 0): java.io.FileNotFoundException: /mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/ukb23157_c19_b2_v1.vcf.gz (Input/output error)
at java.io.FileInputStream.open0(Native Method)
at java.io.FileInputStream.open(FileInputStream.java:195)
at java.io.FileInputStream.<init>(FileInputStream.java:138)
at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileInputStream.<init>(RawLocalFileSystem.java:106)
at org.apache.hadoop.fs.RawLocalFileSystem.open(RawLocalFileSystem.java:202)
at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:143)
at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:346)
at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:769)
at is.hail.io.fs.HadoopFS.openNoCompression(HadoopFS.scala:83)
at is.hail.expr.ir.GenericLines$$anonfun$3$$anon$3.<init>(GenericLines.scala:32)
at is.hail.expr.ir.GenericLines$$anonfun$3.apply(GenericLines.scala:29)
at is.hail.expr.ir.GenericLines$$anonfun$3.apply(GenericLines.scala:21)
at is.hail.io.vcf.MatrixVCFReader$$anonfun$21$$anonfun$apply$11.apply(LoadVCF.scala:1746)
at is.hail.io.vcf.MatrixVCFReader$$anonfun$21$$anonfun$apply$11.apply(LoadVCF.scala:1734)
at __C487collect_distributed_array.apply_region0_2(Unknown Source)
at __C487collect_distributed_array.apply(Unknown Source)
at __C487collect_distributed_array.apply(Unknown Source)
at is.hail.backend.BackendUtils$$anonfun$collectDArray$1$$anonfun$apply$1.apply(BackendUtils.scala:31)
at is.hail.backend.BackendUtils$$anonfun$collectDArray$1$$anonfun$apply$1.apply(BackendUtils.scala:30)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.annotations.RegionPool.scopedRegion(RegionPool.scala:144)
at is.hail.backend.BackendUtils$$anonfun$collectDArray$1.apply(BackendUtils.scala:30)
at is.hail.backend.BackendUtils$$anonfun$collectDArray$1.apply(BackendUtils.scala:28)
at is.hail.backend.spark.SparkBackendComputeRDD.compute(SparkBackend.scala:730)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2001)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1984)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1983)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1983)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1033)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1033)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1033)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2223)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2172)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2161)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:823)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
at is.hail.backend.spark.SparkBackend.parallelizeAndComputeWithIndex(SparkBackend.scala:286)
at is.hail.backend.BackendUtils.collectDArray(BackendUtils.scala:28)
at __C446Compiled.__m448split_WriteMetadata_region4_25(Emit.scala)
at __C446Compiled.__m448split_WriteMetadata(Emit.scala)
at __C446Compiled.apply(Emit.scala)
at is.hail.expr.ir.CompileAndEvaluate$$anonfun$_apply$1.apply$mcV$sp(CompileAndEvaluate.scala:57)
at is.hail.expr.ir.CompileAndEvaluate$$anonfun$_apply$1.apply(CompileAndEvaluate.scala:57)
at is.hail.expr.ir.CompileAndEvaluate$$anonfun$_apply$1.apply(CompileAndEvaluate.scala:57)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:57)
at is.hail.expr.ir.CompileAndEvaluate$.evalToIR(CompileAndEvaluate.scala:30)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:30)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.is$hail$expr$ir$LowerOrInterpretNonCompilable$$rewrite$1(LowerOrInterpretNonCompilable.scala:67)
at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:72)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:69)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:16)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:16)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:16)
at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:14)
at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
at is.hail.expr.ir.lowering.LoweringPass$class.apply(LoweringPass.scala:14)
at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.apply(LoweringPass.scala:64)
at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:15)
at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:13)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:13)
at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:47)
at is.hail.backend.spark.SparkBackend.is$hail$backend$spark$SparkBackend$$_execute(SparkBackend.scala:381)
at is.hail.backend.spark.SparkBackend$$anonfun$8$$anonfun$apply$4.apply(SparkBackend.scala:417)
at is.hail.backend.spark.SparkBackend$$anonfun$8$$anonfun$apply$4.apply(SparkBackend.scala:414)
at is.hail.backend.ExecuteContext$$anonfun$scoped$1$$anonfun$apply$1.apply(ExecuteContext.scala:47)
at is.hail.backend.ExecuteContext$$anonfun$scoped$1$$anonfun$apply$1.apply(ExecuteContext.scala:47)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.backend.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:47)
at is.hail.backend.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:46)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:46)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:275)
at is.hail.backend.spark.SparkBackend$$anonfun$8.apply(SparkBackend.scala:414)
at is.hail.backend.spark.SparkBackend$$anonfun$8.apply(SparkBackend.scala:413)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.backend.spark.SparkBackend.executeEncode(SparkBackend.scala:413)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:750)
java.io.FileNotFoundException: /mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/ukb23157_c19_b2_v1.vcf.gz (Input/output error)
at java.io.FileInputStream.open0(Native Method)
at java.io.FileInputStream.open(FileInputStream.java:195)
at java.io.FileInputStream.<init>(FileInputStream.java:138)
at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileInputStream.<init>(RawLocalFileSystem.java:106)
at org.apache.hadoop.fs.RawLocalFileSystem.open(RawLocalFileSystem.java:202)
at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:143)
at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:346)
at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:769)
at is.hail.io.fs.HadoopFS.openNoCompression(HadoopFS.scala:83)
at is.hail.expr.ir.GenericLines$$anonfun$3$$anon$3.<init>(GenericLines.scala:32)
at is.hail.expr.ir.GenericLines$$anonfun$3.apply(GenericLines.scala:29)
at is.hail.expr.ir.GenericLines$$anonfun$3.apply(GenericLines.scala:21)
at is.hail.io.vcf.MatrixVCFReader$$anonfun$21$$anonfun$apply$11.apply(LoadVCF.scala:1746)
at is.hail.io.vcf.MatrixVCFReader$$anonfun$21$$anonfun$apply$11.apply(LoadVCF.scala:1734)
at __C487collect_distributed_array.apply_region0_2(Unknown Source)
at __C487collect_distributed_array.apply(Unknown Source)
at __C487collect_distributed_array.apply(Unknown Source)
at is.hail.backend.BackendUtils$$anonfun$collectDArray$1$$anonfun$apply$1.apply(BackendUtils.scala:31)
at is.hail.backend.BackendUtils$$anonfun$collectDArray$1$$anonfun$apply$1.apply(BackendUtils.scala:30)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.annotations.RegionPool.scopedRegion(RegionPool.scala:144)
at is.hail.backend.BackendUtils$$anonfun$collectDArray$1.apply(BackendUtils.scala:30)
at is.hail.backend.BackendUtils$$anonfun$collectDArray$1.apply(BackendUtils.scala:28)
at is.hail.backend.spark.SparkBackendComputeRDD.compute(SparkBackend.scala:730)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Hail version: 0.2.78-b17627756568
Error summary: FileNotFoundException: /mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/ukb23157_c19_b2_v1.vcf.gz (Input/output error)
The file does exist in that location.