I am getting the same error with the newest Hail when I simply try to run `import_vcf` on AWS EMR:
In [2]: v = hl.import_vcf(['s3://seqr-dp-data--prod/vcf/dev/grch38_test.vcf'],
   ...:     reference_genome='GRCh38',
   ...:     contig_recoding={'1': 'chr1', '2': 'chr2', '3': 'chr3', '4': 'chr4', '5': 'chr5',
   ...:                      '6': 'chr6', '7': 'chr7', '8': 'chr8', '9': 'chr9', '10': 'chr10',
   ...:                      '11': 'chr11', '12': 'chr12', '13': 'chr13', '14': 'chr14', '15': 'chr15',
   ...:                      '16': 'chr16', '17': 'chr17', '18': 'chr18', '19': 'chr19', '20': 'chr20',
   ...:                      '21': 'chr21', '22': 'chr22', 'X': 'chrX', 'Y': 'chrY'},
   ...:     force_bgz=True, min_partitions=500, array_elements_required=False)
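(The contig_recoding dict above is just the 1..22 plus X/Y mapping written out by hand; an equivalent, more compact way of making the same call would be something like this sketch, with identical behavior:)

# Equivalent construction of the same contig_recoding mapping (sketch only, same call as above).
recode = {str(i): f'chr{i}' for i in range(1, 23)}
recode.update({'X': 'chrX', 'Y': 'chrY'})

v = hl.import_vcf(['s3://seqr-dp-data--prod/vcf/dev/grch38_test.vcf'],
                  reference_genome='GRCh38',
                  contig_recoding=recode,
                  force_bgz=True,
                  min_partitions=500,
                  array_elements_required=False)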
I ran the script that you provided, and it gives the following output:
python 3.7.10
hail 0.2.93
spark 3.1.2
emr 6.5.0
Do you know what I could try here?
I tried the suggestion from my previously asked question (uninstalling pyspark), but it was unsuccessful this time.
Full error stacktrace:
FatalError Traceback (most recent call last)
<ipython-input-2-42e89978c335> in <module>
1 v = hl.import_vcf(['s3://seqr-dp-data--prod/vcf/dev/grch38_test.vcf'],
2 reference_genome='GRCh38', contig_recoding={'1': 'chr1', '2': 'chr2', '3': 'chr3', '4': 'chr4', '5': 'chr5', '6': 'chr6', '7': 'chr7', '8': 'chr8', '9': 'chr9', '10': 'chr10', '11': 'chr11', '12': 'chr12', '13': 'chr13', '14': 'chr14', '15': 'chr15', '16': 'chr16', '17': 'chr17', '18': 'chr18', '19': 'chr19', '20': 'chr20', '21': 'chr21', '22': 'chr22', 'X': 'chrX', 'Y': 'chrY'},
----> 3 force_bgz=True, min_partitions=500, array_elements_required=False)
<decorator-gen-1464> in import_vcf(path, force, force_bgz, header_file, min_partitions, drop_samples, call_fields, reference_genome, contig_recoding, array_elements_required, skip_invalid_loci, entry_float_type, filter, find_replace, n_partitions, block_size, _partitions)
/usr/local/lib/python3.7/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
575 def wrapper(__original_func, *args, **kwargs):
576 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 577 return __original_func(*args_, **kwargs_)
578
579 return wrapper
/usr/local/lib/python3.7/site-packages/hail/methods/impex.py in import_vcf(path, force, force_bgz, header_file, min_partitions, drop_samples, call_fields, reference_genome, contig_recoding, array_elements_required, skip_invalid_loci, entry_float_type, filter, find_replace, n_partitions, block_size, _partitions)
2734 skip_invalid_loci, force_bgz, force, filter, find_replace,
2735 _partitions)
-> 2736 return MatrixTable(ir.MatrixRead(reader, drop_cols=drop_samples))
2737
2738
/usr/local/lib/python3.7/site-packages/hail/matrixtable.py in __init__(self, mir)
556 self._entry_indices = Indices(self, {self._row_axis, self._col_axis})
557
--> 558 self._type = self._mir.typ
559
560 self._global_type = self._type.global_type
/usr/local/lib/python3.7/site-packages/hail/ir/base_ir.py in typ(self)
359 def typ(self):
360 if self._type is None:
--> 361 self._compute_type()
362 assert self._type is not None, self
363 return self._type
/usr/local/lib/python3.7/site-packages/hail/ir/matrix_ir.py in _compute_type(self)
68 def _compute_type(self):
69 if self._type is None:
---> 70 self._type = Env.backend().matrix_type(self)
71
72
/usr/local/lib/python3.7/site-packages/hail/backend/spark_backend.py in matrix_type(self, mir)
289
290 def matrix_type(self, mir):
--> 291 jir = self._to_java_matrix_ir(mir)
292 return tmatrix._from_java(jir.typ())
293
/usr/local/lib/python3.7/site-packages/hail/backend/spark_backend.py in _to_java_matrix_ir(self, ir)
275
276 def _to_java_matrix_ir(self, ir):
--> 277 return self._to_java_ir(ir, self._parse_matrix_ir)
278
279 def _to_java_blockmatrix_ir(self, ir):
/usr/local/lib/python3.7/site-packages/hail/backend/spark_backend.py in _to_java_ir(self, ir, parse)
265 r = CSERenderer(stop_at_jir=True)
266 # FIXME parse should be static
--> 267 ir._jir = parse(r(ir), ir_map=r.jirs)
268 return ir._jir
269
/usr/local/lib/python3.7/site-packages/hail/backend/spark_backend.py in _parse_matrix_ir(self, code, ref_map, ir_map)
243
244 def _parse_matrix_ir(self, code, ref_map={}, ir_map={}):
--> 245 return self._jbackend.parse_matrix_ir(code, ref_map, ir_map)
246
247 def _parse_blockmatrix_ir(self, code, ref_map={}, ir_map={}):
/usr/local/lib/python3.7/site-packages/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/usr/local/lib/python3.7/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
29 tpl = Env.jutils().handleForPython(e.java_exception)
30 deepest, full, error_id = tpl._1(), tpl._2(), tpl._3()
---> 31 raise fatal_error_from_java_error_triplet(deepest, full, error_id) from None
32 except pyspark.sql.utils.CapturedException as e:
33 raise FatalError('%s\n\nJava stack trace:\n%s\n'
FatalError: UnsupportedFileSystemException: No FileSystem for scheme "s3"
Java stack trace:
org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "s3"
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3281)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3301)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
at is.hail.io.fs.HadoopFS.getFileSystem(HadoopFS.scala:100)
at is.hail.io.fs.HadoopFS.glob(HadoopFS.scala:154)
at is.hail.io.fs.HadoopFS.$anonfun$globAll$1(HadoopFS.scala:136)
at is.hail.io.fs.HadoopFS.$anonfun$globAll$1$adapted(HadoopFS.scala:135)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
at scala.collection.AbstractIterator.to(Iterator.scala:1429)
at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1429)
at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1429)
at is.hail.io.fs.HadoopFS.globAll(HadoopFS.scala:141)
at is.hail.io.vcf.MatrixVCFReader$.apply(LoadVCF.scala:1570)
at is.hail.io.vcf.MatrixVCFReader$.fromJValue(LoadVCF.scala:1666)
at is.hail.expr.ir.MatrixReader$.fromJson(MatrixIR.scala:89)
at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:1720)
at is.hail.expr.ir.IRParser$.$anonfun$matrix_ir$1(Parser.scala:1646)
at is.hail.utils.StackSafe$More.advance(StackSafe.scala:64)
at is.hail.utils.StackSafe$.run(StackSafe.scala:16)
at is.hail.utils.StackSafe$StackFrame.run(StackSafe.scala:32)
at is.hail.expr.ir.IRParser$.$anonfun$parse_matrix_ir$1(Parser.scala:1986)
at is.hail.expr.ir.IRParser$.parse(Parser.scala:1973)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1986)
at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$2(SparkBackend.scala:689)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:69)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:69)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:58)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:308)
at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$1(SparkBackend.scala:688)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.utils.ExecutionTimer$.logTime(ExecutionTimer.scala:59)
at is.hail.backend.spark.SparkBackend.parse_matrix_ir(SparkBackend.scala:687)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:750)
Hail version: 0.2.93-d77cdf0157c9
Error summary: UnsupportedFileSystemException: No FileSystem for scheme "s3"
I tried adding two JARs to the classpath:
https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.1/hadoop-aws-2.7.1.jar
http://www.java2s.com/Code/JarDownload/jets3t/jets3t-0.9.0.jar.zip
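In case the mechanism matters: this is roughly how the JARs were put on the Spark classpath. A minimal sketch, assuming the two JARs were downloaded to /home/hadoop on the master node (those local paths are hypothetical); hl.init's spark_conf dict is passed through to the Spark configuration:

import hail as hl

# Hypothetical local paths to the downloaded JARs.
jars = '/home/hadoop/hadoop-aws-2.7.1.jar,/home/hadoop/jets3t-0.9.0.jar'

hl.init(spark_conf={
    'spark.jars': jars,                                        # ship the JARs with the application
    'spark.driver.extraClassPath': jars.replace(',', ':'),     # classpath entries are colon-separated
    'spark.executor.extraClassPath': jars.replace(',', ':'),
})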
Now there is a new, different error:
Error summary: IllegalArgumentException: AWS Access Key ID and Secret Access Key must be specified as the username or password (respectively) of a s3 URL, or by setting the fs.s3.awsAccessKeyId or fs.s3.awsSecretAccessKey properties (respectively).
with the following stack trace:
Java stack trace:
java.lang.IllegalArgumentException: AWS Access Key ID and Secret Access Key must be specified as the username or password (respectively) of a s3 URL, or by setting the fs.s3.awsAccessKeyId or fs.s3.awsSecretAccessKey properties (respectively).
at org.apache.hadoop.fs.s3.S3Credentials.initialize(S3Credentials.java:70)
at org.apache.hadoop.fs.s3.Jets3tFileSystemStore.initialize(Jets3tFileSystemStore.java:93)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)
at com.sun.proxy.$Proxy13.initialize(Unknown Source)
at org.apache.hadoop.fs.s3.S3FileSystem.initialize(S3FileSystem.java:91)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3303)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
at is.hail.io.fs.HadoopFS.getFileSystem(HadoopFS.scala:100)
at is.hail.io.fs.HadoopFS.glob(HadoopFS.scala:154)
at is.hail.io.fs.HadoopFS.$anonfun$globAll$1(HadoopFS.scala:136)
at is.hail.io.fs.HadoopFS.$anonfun$globAll$1$adapted(HadoopFS.scala:135)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
at scala.collection.AbstractIterator.to(Iterator.scala:1429)
at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1429)
at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1429)
at is.hail.io.fs.HadoopFS.globAll(HadoopFS.scala:141)
at is.hail.io.vcf.MatrixVCFReader$.apply(LoadVCF.scala:1570)
at is.hail.io.vcf.MatrixVCFReader$.fromJValue(LoadVCF.scala:1666)
at is.hail.expr.ir.MatrixReader$.fromJson(MatrixIR.scala:89)
at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:1720)
at is.hail.expr.ir.IRParser$.$anonfun$matrix_ir$1(Parser.scala:1646)
at is.hail.utils.StackSafe$More.advance(StackSafe.scala:64)
at is.hail.utils.StackSafe$.run(StackSafe.scala:16)
at is.hail.utils.StackSafe$StackFrame.run(StackSafe.scala:32)
at is.hail.expr.ir.IRParser$.$anonfun$parse_matrix_ir$1(Parser.scala:1986)
at is.hail.expr.ir.IRParser$.parse(Parser.scala:1973)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1986)
at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$2(SparkBackend.scala:689)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:69)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:69)
at is.hail.utils.package$.using(package.scala:638)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:58)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:308)
at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$1(SparkBackend.scala:688)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.utils.ExecutionTimer$.logTime(ExecutionTimer.scala:59)
at is.hail.backend.spark.SparkBackend.parse_matrix_ir(SparkBackend.scala:687)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:750)
How can we provide the AWS Access Key ID and Secret Access Key to Hail?
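From the error text it sounds like Hadoop wants the keys as fs.s3.* properties. Is something like the sketch below the intended way to pass them through to Hail? This is based only on the error message, not on anything I have confirmed works; ACCESS_KEY and SECRET_KEY are placeholders, and I am not sure whether these should go through hl.init's spark_conf or be set on EMR directly:

import hail as hl

# Placeholders -- substitute real credentials (or, ideally, rely on the EMR instance role instead).
ACCESS_KEY = '...'
SECRET_KEY = '...'

# The spark.hadoop.* prefix forwards these as Hadoop configuration properties,
# which is what the IllegalArgumentException above asks for.
hl.init(spark_conf={
    'spark.hadoop.fs.s3.awsAccessKeyId': ACCESS_KEY,
    'spark.hadoop.fs.s3.awsSecretAccessKey': SECRET_KEY,
})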