Hi!
I am trying to read a BGEN file using the following line:
hl.import_bgen(path='s3a://bucket/' + bgen_file, sample_file='s3a://bucket/' + sample_file, entry_fields=['dosage'])
I downloaded the S3 connector JARs, hadoop-aws-3.3.2.jar and aws-java-sdk-bundle-1.12.163.jar. The contents of my spark-defaults.conf file are:
### START: DO NOT EDIT, MANAGED BY: install-s3-connector.sh
spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider
### END: DO NOT EDIT, MANAGED BY: install-s3-connector.sh
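This pins org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider, which, if I read the hadoop-aws documentation correctly, means requests are sent unauthenticated. My understanding is that a static-credential setup would instead look something like this (property names are from the hadoop-aws docs; the key values here are placeholders):
spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider
spark.hadoop.fs.s3a.access.key=PLACEHOLDER_ACCESS_KEY
spark.hadoop.fs.s3a.secret.key=PLACEHOLDER_SECRET_KEY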
But I keep getting the following error:
---------------------------------------------------------------------------
FatalError Traceback (most recent call last)
<ipython-input-4-da0236d38576> in <module>
1 bgen_file = 'ukb_imp_chrXY_v3.bgen'
2 samp_file = 'ukb31028_imp_chrXY_v3_s486359.sample'
----> 3 hl.import_bgen(path = 's3a://eui/imputed_files/' + bgen_file, sample_file = 's3a://sample_files/' + samp_file, entry_fields = ['dosage'])
<decorator-gen-1552> in import_bgen(path, entry_fields, sample_file, n_partitions, block_size, index_file_map, variants, _row_fields)
~/.local/lib/python3.8/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
582 def wrapper(__original_func, *args, **kwargs):
583 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 584 return __original_func(*args_, **kwargs_)
585
586 return wrapper
~/.local/lib/python3.8/site-packages/hail/methods/impex.py in import_bgen(path, entry_fields, sample_file, n_partitions, block_size, index_file_map, variants, _row_fields)
1293 reader = ir.MatrixBGENReader(path, sample_file, index_file_map, n_partitions, block_size, variants_path)
1294
-> 1295 mt = (MatrixTable(ir.MatrixRead(reader))
1296 .drop(*[fd for fd in ['GT', 'GP', 'dosage'] if fd not in entry_set],
1297 *[fd for fd in ['rsid', 'varid', 'offset', 'file_idx'] if fd not in row_set]))
~/.local/lib/python3.8/site-packages/hail/matrixtable.py in __init__(self, mir)
707 self._entry_indices = Indices(self, {self._row_axis, self._col_axis})
708
--> 709 self._type = self._mir.typ
710
711 self._global_type = self._type.global_type
~/.local/lib/python3.8/site-packages/hail/ir/base_ir.py in typ(self)
493 def typ(self):
494 if self._type is None:
--> 495 self.compute_type(deep_typecheck=False)
496 return self._type
497
~/.local/lib/python3.8/site-packages/hail/ir/base_ir.py in compute_type(self, deep_typecheck)
484 def compute_type(self, deep_typecheck):
485 if deep_typecheck or self._type is None:
--> 486 computed = self._compute_type(deep_typecheck)
487 if self._type is not None:
488 assert self._type == computed
~/.local/lib/python3.8/site-packages/hail/ir/matrix_ir.py in _compute_type(self, deep_typecheck)
183 def _compute_type(self, deep_typecheck):
184 if self._type is None:
--> 185 return Env.backend().matrix_type(self)
186 else:
187 return self._type
~/.local/lib/python3.8/site-packages/hail/backend/py4j_backend.py in matrix_type(self, mir)
182
183 def matrix_type(self, mir):
--> 184 jir = self._to_java_matrix_ir(mir)
185 return tmatrix._from_java(jir.typ())
186
~/.local/lib/python3.8/site-packages/hail/backend/py4j_backend.py in _to_java_matrix_ir(self, ir)
168
169 def _to_java_matrix_ir(self, ir):
--> 170 return self._to_java_ir(ir, self._parse_matrix_ir)
171
172 def _to_java_blockmatrix_ir(self, ir):
~/.local/lib/python3.8/site-packages/hail/backend/py4j_backend.py in _to_java_ir(self, ir, parse)
143 r = CSERenderer(stop_at_jir=True)
144 # FIXME parse should be static
--> 145 ir._jir = parse(r(finalize_randomness(ir)), ir_map=r.jirs)
146 return ir._jir
147
~/.local/lib/python3.8/site-packages/hail/backend/py4j_backend.py in _parse_matrix_ir(self, code, ir_map)
156
157 def _parse_matrix_ir(self, code, ir_map={}):
--> 158 return self._jbackend.parse_matrix_ir(code, ir_map)
159
160 def _parse_blockmatrix_ir(self, code, ir_map={}):
~/.local/lib/python3.8/site-packages/py4j/java_gateway.py in __call__(self, *args)
1319
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1323
~/.local/lib/python3.8/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
33 tpl = Env.jutils().handleForPython(e.java_exception)
34 deepest, full, error_id = tpl._1(), tpl._2(), tpl._3()
---> 35 raise fatal_error_from_java_error_triplet(deepest, full, error_id) from None
36 except pyspark.sql.utils.CapturedException as e:
37 raise FatalError('%s\n\nJava stack trace:\n%s\n'
FatalError: AmazonS3Exception: Forbidden (Service: Amazon S3; Status Code: 403; Error Code: 403 Forbidden; Request ID: WVPHWWA5GT8TAQYM; S3 Extended Request ID: Wo0bslLuIq/fIFwHcvLlPRkVdcoYYc/IIH02HnIBrznDE/zk9cVgYDEysF8G9lrjxb3TtWApwf56JeyUPilxGuh/F40MCvNqfcXKsQOOSb4=; Proxy: null)
Java stack trace:
java.nio.file.AccessDeniedException: s3a://eui/imputed_files/ukb_imp_chrXY_v3.bgen: getFileStatus on s3a://eui/imputed_files/ukb_imp_chrXY_v3.bgen: com.amazonaws.services.s3.model.AmazonS3Exception: Forbidden (Service: Amazon S3; Status Code: 403; Error Code: 403 Forbidden; Request ID: WVPHWWA5GT8TAQYM; S3 Extended Request ID: Wo0bslLuIq/fIFwHcvLlPRkVdcoYYc/IIH02HnIBrznDE/zk9cVgYDEysF8G9lrjxb3TtWApwf56JeyUPilxGuh/F40MCvNqfcXKsQOOSb4=; Proxy: null), S3 Extended Request ID: Wo0bslLuIq/fIFwHcvLlPRkVdcoYYc/IIH02HnIBrznDE/zk9cVgYDEysF8G9lrjxb3TtWApwf56JeyUPilxGuh/F40MCvNqfcXKsQOOSb4=:403 Forbidden
at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:255)
at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:175)
at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3796)
at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3688)
at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$getFileStatus$24(S3AFileSystem.java:3556)
at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499)
at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration(IOStatisticsBinding.java:444)
at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2337)
at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2356)
at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:3554)
at org.apache.hadoop.fs.Globber.getFileStatus(Globber.java:115)
at org.apache.hadoop.fs.Globber.doGlob(Globber.java:349)
at org.apache.hadoop.fs.Globber.glob(Globber.java:202)
at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$globStatus$33(S3AFileSystem.java:4689)
at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499)
at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration(IOStatisticsBinding.java:444)
at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2337)
at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2356)
at org.apache.hadoop.fs.s3a.S3AFileSystem.globStatus(S3AFileSystem.java:4682)
at org.apache.hadoop.fs.s3a.S3AFileSystem.globStatus(S3AFileSystem.java:4663)
at is.hail.io.fs.HadoopFS.glob(HadoopFS.scala:169)
at is.hail.io.bgen.LoadBgen$.$anonfun$getAllFileStatuses$1(LoadBgen.scala:155)
at is.hail.io.bgen.LoadBgen$.$anonfun$getAllFileStatuses$1$adapted(LoadBgen.scala:154)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:293)
at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:293)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:290)
at scala.collection.mutable.ArrayOps$ofRef.flatMap(ArrayOps.scala:198)
at is.hail.io.bgen.LoadBgen$.getAllFileStatuses(LoadBgen.scala:154)
at is.hail.io.bgen.LoadBgen$.getAllFilePaths(LoadBgen.scala:181)
at is.hail.io.bgen.MatrixBGENReader$.apply(LoadBgen.scala:339)
at is.hail.io.bgen.MatrixBGENReader$.fromJValue(LoadBgen.scala:322)
at is.hail.expr.ir.MatrixReader$.fromJson(MatrixIR.scala:87)
at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:1874)
at is.hail.expr.ir.IRParser$.$anonfun$matrix_ir$1(Parser.scala:1790)
at is.hail.utils.StackSafe$More.advance(StackSafe.scala:64)
at is.hail.utils.StackSafe$.run(StackSafe.scala:16)
at is.hail.utils.StackSafe$StackFrame.run(StackSafe.scala:32)
at is.hail.expr.ir.IRParser$.$anonfun$parse_matrix_ir$1(Parser.scala:2153)
at is.hail.expr.ir.IRParser$.parse(Parser.scala:2138)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:2153)
at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$2(SparkBackend.scala:707)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:75)
at is.hail.utils.package$.using(package.scala:635)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:75)
at is.hail.utils.package$.using(package.scala:635)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:63)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:351)
at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$1(SparkBackend.scala:706)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.utils.ExecutionTimer$.logTime(ExecutionTimer.scala:59)
at is.hail.backend.spark.SparkBackend.parse_matrix_ir(SparkBackend.scala:705)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Unknown Source)
com.amazonaws.services.s3.model.AmazonS3Exception: Forbidden (Service: Amazon S3; Status Code: 403; Error Code: 403 Forbidden; Request ID: WVPHWWA5GT8TAQYM; S3 Extended Request ID: Wo0bslLuIq/fIFwHcvLlPRkVdcoYYc/IIH02HnIBrznDE/zk9cVgYDEysF8G9lrjxb3TtWApwf56JeyUPilxGuh/F40MCvNqfcXKsQOOSb4=; Proxy: null)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1862)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleServiceErrorResponse(AmazonHttpClient.java:1415)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1384)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1154)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:811)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:779)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:753)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:713)
at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:695)
at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:559)
at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:539)
at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5453)
at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5400)
at com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:1372)
at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$getObjectMetadata$10(S3AFileSystem.java:2545)
at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:414)
at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:377)
at org.apache.hadoop.fs.s3a.S3AFileSystem.getObjectMetadata(S3AFileSystem.java:2533)
at org.apache.hadoop.fs.s3a.S3AFileSystem.getObjectMetadata(S3AFileSystem.java:2513)
at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3776)
at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3688)
at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$getFileStatus$24(S3AFileSystem.java:3556)
at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499)
at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration(IOStatisticsBinding.java:444)
at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2337)
at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2356)
at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:3554)
at org.apache.hadoop.fs.Globber.getFileStatus(Globber.java:115)
at org.apache.hadoop.fs.Globber.doGlob(Globber.java:349)
at org.apache.hadoop.fs.Globber.glob(Globber.java:202)
at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$globStatus$33(S3AFileSystem.java:4689)
at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499)
at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration(IOStatisticsBinding.java:444)
at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2337)
at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2356)
at org.apache.hadoop.fs.s3a.S3AFileSystem.globStatus(S3AFileSystem.java:4682)
at org.apache.hadoop.fs.s3a.S3AFileSystem.globStatus(S3AFileSystem.java:4663)
at is.hail.io.fs.HadoopFS.glob(HadoopFS.scala:169)
at is.hail.io.bgen.LoadBgen$.$anonfun$getAllFileStatuses$1(LoadBgen.scala:155)
at is.hail.io.bgen.LoadBgen$.$anonfun$getAllFileStatuses$1$adapted(LoadBgen.scala:154)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:293)
at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:293)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:290)
at scala.collection.mutable.ArrayOps$ofRef.flatMap(ArrayOps.scala:198)
at is.hail.io.bgen.LoadBgen$.getAllFileStatuses(LoadBgen.scala:154)
at is.hail.io.bgen.LoadBgen$.getAllFilePaths(LoadBgen.scala:181)
at is.hail.io.bgen.MatrixBGENReader$.apply(LoadBgen.scala:339)
at is.hail.io.bgen.MatrixBGENReader$.fromJValue(LoadBgen.scala:322)
at is.hail.expr.ir.MatrixReader$.fromJson(MatrixIR.scala:87)
at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:1874)
at is.hail.expr.ir.IRParser$.$anonfun$matrix_ir$1(Parser.scala:1790)
at is.hail.utils.StackSafe$More.advance(StackSafe.scala:64)
at is.hail.utils.StackSafe$.run(StackSafe.scala:16)
at is.hail.utils.StackSafe$StackFrame.run(StackSafe.scala:32)
at is.hail.expr.ir.IRParser$.$anonfun$parse_matrix_ir$1(Parser.scala:2153)
at is.hail.expr.ir.IRParser$.parse(Parser.scala:2138)
at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:2153)
at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$2(SparkBackend.scala:707)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:75)
at is.hail.utils.package$.using(package.scala:635)
at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:75)
at is.hail.utils.package$.using(package.scala:635)
at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:63)
at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:351)
at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$1(SparkBackend.scala:706)
at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
at is.hail.utils.ExecutionTimer$.logTime(ExecutionTimer.scala:59)
at is.hail.backend.spark.SparkBackend.parse_matrix_ir(SparkBackend.scala:705)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Unknown Source)
Hail version: 0.2.116-cd64e0876c94
Error summary: AmazonS3Exception: Forbidden (Service: Amazon S3; Status Code: 403; Error Code: 403 Forbidden; Request ID: WVPHWWA5GT8TAQYM; S3 Extended Request ID: Wo0bslLuIq/fIFwHcvLlPRkVdcoYYc/IIH02HnIBrznDE/zk9cVgYDEysF8G9lrjxb3TtWApwf56JeyUPilxGuh/F40MCvNqfcXKsQOOSb4=; Proxy: null)
Given that the error mentions ‘Access Denied’ and I am trying to read files in a private bucket, I figured the problem is in supplying valid credentials, though I could not really work out how to implement most of the authentication methods. I did try setting environment variables with my access and secret keys, as described in the Hadoop manual here, but I still get the same error.
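Concretely, that attempt amounted to something like this before initializing Hail (a sketch; these are the standard AWS SDK variable names, and the values are placeholders):
import os

# standard AWS SDK credential environment variables (placeholder values);
# as far as I understand, these only take effect if the credential provider
# chain actually consults the environment
os.environ["AWS_ACCESS_KEY_ID"] = "PLACEHOLDER"
os.environ["AWS_SECRET_ACCESS_KEY"] = "PLACEHOLDER"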
I’ve also come across this issue and provided my keys similarly:
sc = hl.spark_context()
hconf = sc._jsc.hadoopConfiguration()
# s3a takes static credentials from fs.s3a.access.key / fs.s3a.secret.key
# (fs.s3a.awsAccessKeyId / fs.s3a.awsSecretAccessKey are not read by s3a)
hconf.set("fs.s3a.access.key", access_key)
hconf.set("fs.s3a.secret.key", security_key)
hconf.set("fs.s3a.endpoint", f'https://{endpoint_url}')
hconf.set("fs.s3a.connection.ssl.enabled", "true")
# the s3a:// scheme is implemented by S3AFileSystem (not the old S3FileSystem)
hconf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
To no avail; the error is still the same.
Every other issue I can find concerns use cases where people run Hail on Amazon EMR; I am running it on my university's cluster.
And, just in case: I can access the files using the boto3 package or by running s3cmd in the terminal, so at least I know a connection can be made.
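For example, something along these lines succeeds with the same credentials and endpoint (the bucket and key are taken from the traceback above):
import boto3

# direct check outside of Hail/Spark, using the same values as above
s3 = boto3.client(
    "s3",
    aws_access_key_id=access_key,
    aws_secret_access_key=security_key,
    endpoint_url=f'https://{endpoint_url}',
)
print(s3.head_object(Bucket="eui", Key="imputed_files/ukb_imp_chrXY_v3.bgen"))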
I would appreciate any help on this issue. At this point, I don’t know what else to try.
Thank you!