Error summary: UnsupportedFileSystemException: No FileSystem for scheme "s3"

I am getting the same error when I simply run import_vcf on AWS EMR:

In [2]: v = hl.import_vcf(
   ...:     ['s3://seqr-dp-data--prod/vcf/dev/grch38_test.vcf'],
   ...:     reference_genome='GRCh38',
   ...:     contig_recoding={
   ...:         '1': 'chr1', '2': 'chr2', '3': 'chr3', '4': 'chr4', '5': 'chr5',
   ...:         '6': 'chr6', '7': 'chr7', '8': 'chr8', '9': 'chr9', '10': 'chr10',
   ...:         '11': 'chr11', '12': 'chr12', '13': 'chr13', '14': 'chr14', '15': 'chr15',
   ...:         '16': 'chr16', '17': 'chr17', '18': 'chr18', '19': 'chr19', '20': 'chr20',
   ...:         '21': 'chr21', '22': 'chr22', 'X': 'chrX', 'Y': 'chrY'},
   ...:     force_bgz=True, min_partitions=500, array_elements_required=False)

with the newest Hail. I ran the script you provided, and it gives this output:

python 3.7.10
hail 0.2.93
spark 3.1.2
emr 6.5.0
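
For reference, this is my reconstruction of the version check (a sketch, not necessarily the exact script you provided; the EMR release I read off the console):

import sys
import hail as hl
import pyspark

# Print the versions reported above.
print('python', sys.version.split()[0])
print('hail', hl.version())
print('spark', pyspark.__version__)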

Do you know what I could try here?

I also tried the suggestion from my earlier question to uninstall pyspark, but it was unsuccessful this time.
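
Specifically, this is approximately what I ran from the notebook (a sketch; the pip binary name may differ on the cluster):

# IPython shell escape to remove the pip-installed pyspark.
!pip3 uninstall -y pyspark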

Full error stack trace:

FatalError                                Traceback (most recent call last)
<ipython-input-2-42e89978c335> in <module>
      1 v = hl.import_vcf(['s3://seqr-dp-data--prod/vcf/dev/grch38_test.vcf'],
      2                              reference_genome='GRCh38', contig_recoding={'1': 'chr1', '2': 'chr2', '3': 'chr3', '4': 'chr4', '5': 'chr5', '6': 'chr6', '7': 'chr7', '8': 'chr8', '9': 'chr9', '10': 'chr10', '11': 'chr11', '12': 'chr12', '13': 'chr13', '14': 'chr14', '15': 'chr15', '16': 'chr16', '17': 'chr17', '18': 'chr18', '19': 'chr19', '20': 'chr20', '21': 'chr21', '22': 'chr22', 'X': 'chrX', 'Y': 'chrY'},
----> 3                              force_bgz=True, min_partitions=500, array_elements_required=False)

<decorator-gen-1464> in import_vcf(path, force, force_bgz, header_file, min_partitions, drop_samples, call_fields, reference_genome, contig_recoding, array_elements_required, skip_invalid_loci, entry_float_type, filter, find_replace, n_partitions, block_size, _partitions)

/usr/local/lib/python3.7/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
    575     def wrapper(__original_func, *args, **kwargs):
    576         args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 577         return __original_func(*args_, **kwargs_)
    578 
    579     return wrapper

/usr/local/lib/python3.7/site-packages/hail/methods/impex.py in import_vcf(path, force, force_bgz, header_file, min_partitions, drop_samples, call_fields, reference_genome, contig_recoding, array_elements_required, skip_invalid_loci, entry_float_type, filter, find_replace, n_partitions, block_size, _partitions)
   2734                                 skip_invalid_loci, force_bgz, force, filter, find_replace,
   2735                                 _partitions)
-> 2736     return MatrixTable(ir.MatrixRead(reader, drop_cols=drop_samples))
   2737 
   2738 

/usr/local/lib/python3.7/site-packages/hail/matrixtable.py in __init__(self, mir)
    556         self._entry_indices = Indices(self, {self._row_axis, self._col_axis})
    557 
--> 558         self._type = self._mir.typ
    559 
    560         self._global_type = self._type.global_type

/usr/local/lib/python3.7/site-packages/hail/ir/base_ir.py in typ(self)
    359     def typ(self):
    360         if self._type is None:
--> 361             self._compute_type()
    362             assert self._type is not None, self
    363         return self._type

/usr/local/lib/python3.7/site-packages/hail/ir/matrix_ir.py in _compute_type(self)
     68     def _compute_type(self):
     69         if self._type is None:
---> 70             self._type = Env.backend().matrix_type(self)
     71 
     72 

/usr/local/lib/python3.7/site-packages/hail/backend/spark_backend.py in matrix_type(self, mir)
    289 
    290     def matrix_type(self, mir):
--> 291         jir = self._to_java_matrix_ir(mir)
    292         return tmatrix._from_java(jir.typ())
    293 

/usr/local/lib/python3.7/site-packages/hail/backend/spark_backend.py in _to_java_matrix_ir(self, ir)
    275 
    276     def _to_java_matrix_ir(self, ir):
--> 277         return self._to_java_ir(ir, self._parse_matrix_ir)
    278 
    279     def _to_java_blockmatrix_ir(self, ir):

/usr/local/lib/python3.7/site-packages/hail/backend/spark_backend.py in _to_java_ir(self, ir, parse)
    265             r = CSERenderer(stop_at_jir=True)
    266             # FIXME parse should be static
--> 267             ir._jir = parse(r(ir), ir_map=r.jirs)
    268         return ir._jir
    269 

/usr/local/lib/python3.7/site-packages/hail/backend/spark_backend.py in _parse_matrix_ir(self, code, ref_map, ir_map)
    243 
    244     def _parse_matrix_ir(self, code, ref_map={}, ir_map={}):
--> 245         return self._jbackend.parse_matrix_ir(code, ref_map, ir_map)
    246 
    247     def _parse_blockmatrix_ir(self, code, ref_map={}, ir_map={}):

/usr/local/lib/python3.7/site-packages/py4j/java_gateway.py in __call__(self, *args)
   1303         answer = self.gateway_client.send_command(command)
   1304         return_value = get_return_value(
-> 1305             answer, self.gateway_client, self.target_id, self.name)
   1306 
   1307         for temp_arg in temp_args:

/usr/local/lib/python3.7/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
     29             tpl = Env.jutils().handleForPython(e.java_exception)
     30             deepest, full, error_id = tpl._1(), tpl._2(), tpl._3()
---> 31             raise fatal_error_from_java_error_triplet(deepest, full, error_id) from None
     32         except pyspark.sql.utils.CapturedException as e:
     33             raise FatalError('%s\n\nJava stack trace:\n%s\n'

FatalError: UnsupportedFileSystemException: No FileSystem for scheme "s3"

Java stack trace:
org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "s3"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3281)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3301)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
	at is.hail.io.fs.HadoopFS.getFileSystem(HadoopFS.scala:100)
	at is.hail.io.fs.HadoopFS.glob(HadoopFS.scala:154)
	at is.hail.io.fs.HadoopFS.$anonfun$globAll$1(HadoopFS.scala:136)
	at is.hail.io.fs.HadoopFS.$anonfun$globAll$1$adapted(HadoopFS.scala:135)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at scala.collection.AbstractIterator.to(Iterator.scala:1429)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1429)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1429)
	at is.hail.io.fs.HadoopFS.globAll(HadoopFS.scala:141)
	at is.hail.io.vcf.MatrixVCFReader$.apply(LoadVCF.scala:1570)
	at is.hail.io.vcf.MatrixVCFReader$.fromJValue(LoadVCF.scala:1666)
	at is.hail.expr.ir.MatrixReader$.fromJson(MatrixIR.scala:89)
	at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:1720)
	at is.hail.expr.ir.IRParser$.$anonfun$matrix_ir$1(Parser.scala:1646)
	at is.hail.utils.StackSafe$More.advance(StackSafe.scala:64)
	at is.hail.utils.StackSafe$.run(StackSafe.scala:16)
	at is.hail.utils.StackSafe$StackFrame.run(StackSafe.scala:32)
	at is.hail.expr.ir.IRParser$.$anonfun$parse_matrix_ir$1(Parser.scala:1986)
	at is.hail.expr.ir.IRParser$.parse(Parser.scala:1973)
	at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1986)
	at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$2(SparkBackend.scala:689)
	at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:69)
	at is.hail.utils.package$.using(package.scala:638)
	at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:69)
	at is.hail.utils.package$.using(package.scala:638)
	at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
	at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:58)
	at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:308)
	at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$1(SparkBackend.scala:688)
	at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
	at is.hail.utils.ExecutionTimer$.logTime(ExecutionTimer.scala:59)
	at is.hail.backend.spark.SparkBackend.parse_matrix_ir(SparkBackend.scala:687)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)



Hail version: 0.2.93-d77cdf0157c9
Error summary: UnsupportedFileSystemException: No FileSystem for scheme "s3"

I tried adding these two jars to the classpath (roughly as sketched below the links):

https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.1/hadoop-aws-2.7.1.jar
http://www.java2s.com/Code/JarDownload/jets3t/jets3t-0.9.0.jar.zip
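
This is roughly how I wired them in (a sketch; the local paths are just where I copied the downloaded jars on the master node):

import hail as hl

# Assumed local copies of the two jars listed above.
jars = '/home/hadoop/hadoop-aws-2.7.1.jar,/home/hadoop/jets3t-0.9.0.jar'
hl.init(spark_conf={'spark.jars': jars})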

And now there is a new, different error:

Error summary: IllegalArgumentException: AWS Access Key ID and Secret Access Key must be specified as the username or password (respectively) of a s3 URL, or by setting the fs.s3.awsAccessKeyId or fs.s3.awsSecretAccessKey properties (respectively).

with the stacktrace:

Java stack trace:
java.lang.IllegalArgumentException: AWS Access Key ID and Secret Access Key must be specified as the username or password (respectively) of a s3 URL, or by setting the fs.s3.awsAccessKeyId or fs.s3.awsSecretAccessKey properties (respectively).
	at org.apache.hadoop.fs.s3.S3Credentials.initialize(S3Credentials.java:70)
	at org.apache.hadoop.fs.s3.Jets3tFileSystemStore.initialize(Jets3tFileSystemStore.java:93)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)
	at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)
	at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)
	at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)
	at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)
	at com.sun.proxy.$Proxy13.initialize(Unknown Source)
	at org.apache.hadoop.fs.s3.S3FileSystem.initialize(S3FileSystem.java:91)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3303)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
	at is.hail.io.fs.HadoopFS.getFileSystem(HadoopFS.scala:100)
	at is.hail.io.fs.HadoopFS.glob(HadoopFS.scala:154)
	at is.hail.io.fs.HadoopFS.$anonfun$globAll$1(HadoopFS.scala:136)
	at is.hail.io.fs.HadoopFS.$anonfun$globAll$1$adapted(HadoopFS.scala:135)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at scala.collection.AbstractIterator.to(Iterator.scala:1429)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1429)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1429)
	at is.hail.io.fs.HadoopFS.globAll(HadoopFS.scala:141)
	at is.hail.io.vcf.MatrixVCFReader$.apply(LoadVCF.scala:1570)
	at is.hail.io.vcf.MatrixVCFReader$.fromJValue(LoadVCF.scala:1666)
	at is.hail.expr.ir.MatrixReader$.fromJson(MatrixIR.scala:89)
	at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:1720)
	at is.hail.expr.ir.IRParser$.$anonfun$matrix_ir$1(Parser.scala:1646)
	at is.hail.utils.StackSafe$More.advance(StackSafe.scala:64)
	at is.hail.utils.StackSafe$.run(StackSafe.scala:16)
	at is.hail.utils.StackSafe$StackFrame.run(StackSafe.scala:32)
	at is.hail.expr.ir.IRParser$.$anonfun$parse_matrix_ir$1(Parser.scala:1986)
	at is.hail.expr.ir.IRParser$.parse(Parser.scala:1973)
	at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1986)
	at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$2(SparkBackend.scala:689)
	at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:69)
	at is.hail.utils.package$.using(package.scala:638)
	at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:69)
	at is.hail.utils.package$.using(package.scala:638)
	at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
	at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:58)
	at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:308)
	at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$1(SparkBackend.scala:688)
	at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
	at is.hail.utils.ExecutionTimer$.logTime(ExecutionTimer.scala:59)
	at is.hail.backend.spark.SparkBackend.parse_matrix_ir(SparkBackend.scala:687)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)

How can I provide the AWS Access Key ID and Secret Access Key to Hail?
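
Is it something along these lines? (a sketch with placeholder values; I'm guessing at the spark.hadoop.* prefix for forwarding the fs.s3.* properties named in the error):

import hail as hl

# Placeholder credentials, not real keys. fs.s3.awsAccessKeyId and
# fs.s3.awsSecretAccessKey are the properties the error message mentions;
# the 'spark.hadoop.' prefix forwards them to the Hadoop configuration.
hl.init(spark_conf={
    'spark.hadoop.fs.s3.awsAccessKeyId': '<ACCESS_KEY_ID>',
    'spark.hadoop.fs.s3.awsSecretAccessKey': '<SECRET_ACCESS_KEY>',
})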