Error 403 Forbidden when I try to load the Hail experimental dataset '1000_Genomes_autosomes'

Hello!

I am running the following command:
mt1kg = hl.experimental.load_dataset(name='1000_Genomes_autosomes', version='phase_3', reference_genome='GRCh37')

And I get the error shown at the bottom. I am running this in a notebook on Google Cloud. I am signed into my Broad account and am successfully using other Hail commands.
Thank you so much for your help!

---------------------------------------------------------------------------
FatalError                                Traceback (most recent call last)
<ipython-input-11-53d86367c321> in <module>
      1 # QC 1KG MT:
----> 2 mt1kg = hl.experimental.load_dataset(name='1000_Genomes_autosomes', version='phase_3' ,reference_genome='GRCh37')

/opt/conda/miniconda3/lib/python3.6/site-packages/hail/experimental/datasets.py in load_dataset(name, version, reference_genome, config_file)
     31     :class:`.Table` or :class:`.MatrixTable`"""
     32 
---> 33     with hl.hadoop_open(config_file, 'r') as f:
     34         datasets = json.load(f)
     35 

<decorator-gen-131> in hadoop_open(path, mode, buffer_size)

/opt/conda/miniconda3/lib/python3.6/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
    612     def wrapper(__original_func, *args, **kwargs):
    613         args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 614         return __original_func(*args_, **kwargs_)
    615 
    616     return wrapper

/opt/conda/miniconda3/lib/python3.6/site-packages/hail/utils/hadoop_utils.py in hadoop_open(path, mode, buffer_size)
     77         Readable or writable file handle.
     78     """
---> 79     return Env.fs().open(path, mode, buffer_size)
     80 
     81 

/opt/conda/miniconda3/lib/python3.6/site-packages/hail/fs/hadoop_fs.py in open(self, path, mode, buffer_size)
     13     def open(self, path: str, mode: str = 'r', buffer_size: int = 8192):
     14         if 'r' in mode:
---> 15             handle = io.BufferedReader(HadoopReader(self, path, buffer_size), buffer_size=buffer_size)
     16         elif 'w' in mode:
     17             handle = io.BufferedWriter(HadoopWriter(self, path), buffer_size=buffer_size)

/opt/conda/miniconda3/lib/python3.6/site-packages/hail/fs/hadoop_fs.py in __init__(self, hfs, path, buffer_size)
     49     def __init__(self, hfs, path, buffer_size):
     50         super(HadoopReader, self).__init__()
---> 51         self._jfile = hfs._utils_package_object.readFile(hfs._jfs, path, buffer_size)
     52 
     53     def close(self):

/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258 
   1259         for temp_arg in temp_args:

/opt/conda/miniconda3/lib/python3.6/site-packages/hail/backend/spark_backend.py in deco(*args, **kwargs)
     39             raise FatalError('%s\n\nJava stack trace:\n%s\n'
     40                              'Hail version: %s\n'
---> 41                              'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
     42         except pyspark.sql.utils.CapturedException as e:
     43             raise FatalError('%s\n\nJava stack trace:\n%s\n'

FatalError: GoogleJsonResponseException: 403 Forbidden
{
  "code" : 403,
  "errors" : [ {
    "domain" : "global",
    "message" : "408727143846-compute@developer.gserviceaccount.com does not have storage.objects.get access to the Google Cloud Storage object.",
    "reason" : "forbidden"
  } ],
  "message" : "408727143846-compute@developer.gserviceaccount.com does not have storage.objects.get access to the Google Cloud Storage object."
}

Java stack trace:
java.io.IOException: Error accessing gs://hail-datasets/datasets.json
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.getObject(GoogleCloudStorageImpl.java:1945)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.getItemInfo(GoogleCloudStorageImpl.java:1851)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.open(GoogleCloudStorageImpl.java:629)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem.open(GoogleCloudStorageFileSystem.java:322)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFSInputStream.<init>(GoogleHadoopFSInputStream.java:77)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.open(GoogleHadoopFileSystemBase.java:740)
	at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:914)
	at is.hail.io.fs.HadoopFS.openNoCompression(HadoopFS.scala:83)
	at is.hail.io.fs.FS$class.open(FS.scala:139)
	at is.hail.io.fs.HadoopFS.open(HadoopFS.scala:70)
	at is.hail.io.fs.FS$class.open(FS.scala:148)
	at is.hail.io.fs.HadoopFS.open(HadoopFS.scala:70)
	at is.hail.utils.Py4jUtils$class.readFile(Py4jUtils.scala:105)
	at is.hail.utils.package$.readFile(package.scala:77)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)

com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.json.GoogleJsonResponseException: 403 Forbidden
{
  "code" : 403,
  "errors" : [ {
    "domain" : "global",
    "message" : "408727143846-compute@developer.gserviceaccount.com does not have storage.objects.get access to the Google Cloud Storage object.",
    "reason" : "forbidden"
  } ],
  "message" : "408727143846-compute@developer.gserviceaccount.com does not have storage.objects.get access to the Google Cloud Storage object."
}
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.json.GoogleJsonResponseException.from(GoogleJsonResponseException.java:150)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.json.AbstractGoogleJsonClientRequest.newExceptionOnError(AbstractGoogleJsonClientRequest.java:113)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.json.AbstractGoogleJsonClientRequest.newExceptionOnError(AbstractGoogleJsonClientRequest.java:40)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.AbstractGoogleClientRequest$1.interceptResponse(AbstractGoogleClientRequest.java:401)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.http.HttpRequest.execute(HttpRequest.java:1097)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:499)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:432)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.execute(AbstractGoogleClientRequest.java:549)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.getObject(GoogleCloudStorageImpl.java:1939)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.getItemInfo(GoogleCloudStorageImpl.java:1851)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.open(GoogleCloudStorageImpl.java:629)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem.open(GoogleCloudStorageFileSystem.java:322)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFSInputStream.<init>(GoogleHadoopFSInputStream.java:77)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.open(GoogleHadoopFileSystemBase.java:740)
	at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:914)
	at is.hail.io.fs.HadoopFS.openNoCompression(HadoopFS.scala:83)
	at is.hail.io.fs.FS$class.open(FS.scala:139)
	at is.hail.io.fs.HadoopFS.open(HadoopFS.scala:70)
	at is.hail.io.fs.FS$class.open(FS.scala:148)
	at is.hail.io.fs.HadoopFS.open(HadoopFS.scala:70)
	at is.hail.utils.Py4jUtils$class.readFile(Py4jUtils.scala:105)
	at is.hail.utils.package$.readFile(package.scala:77)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)




Hail version: 0.2.55-0d4ce0df2457
Error summary: GoogleJsonResponseException: 403 Forbidden
{
  "code" : 403,
  "errors" : [ {
    "domain" : "global",
    "message" : "408727143846-compute@developer.gserviceaccount.com does not have storage.objects.get access to the Google Cloud Storage object.",
    "reason" : "forbidden"
  } ],
  "message" : "408727143846-compute@developer.gserviceaccount.com does not have storage.objects.get access to the Google Cloud Storage object."
}

We had to move these files to a requester-pays bucket (the user pays egress charges when downloading data out of Google Cloud; access is free from US regions of Google Cloud).

If you update to the latest Hail version, this should work.
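
For reference, a minimal sketch of what the call looks like once Hail is updated, assuming the newer load_dataset signature that also takes region and cloud arguments (shown here only for illustration; they select which hosted copy of the data to read):

import hail as hl

# Sketch assuming a recent Hail version; region/cloud pick the hosted copy
# of the dataset (the US Google Cloud copy in this case).
mt1kg = hl.experimental.load_dataset(name='1000_Genomes_autosomes',
                                     version='phase_3',
                                     reference_genome='GRCh37',
                                     region='us',
                                     cloud='gcp')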

Thanks!
I updated to the latest Hail version on my laptop using:
pip3 install -U hail

Then I created a new Hail Dataproc cluster and notebook, but I am still getting the same error. Did I update Hail correctly?

What’s the output of hailctl --version?

It says that hailctl --version isn’t a command, but the output of hailctl version is:
0.2.55-0d4ce0df2457

Er, yeah, that. OK, that’s still the older version. How about which hailctl?

Maybe you have two Python distributions.

which hailctl outputs /Users/lilllianpetersen/miniconda3/bin/hailctl

Can you do which pip and which pip3?

Something you might try is:

pip uninstall -y hail
pip3 uninstall -y hail
pip3 install hail

which pip
/Users/lilllianpetersen/miniconda3/bin/pip

which pip3
/usr/local/bin/pip3

Yeah, you’ve totally got two Python distributions. I think uninstalling from both and reinstalling with just pip3 should work.

Er, John pointed out that you should really be just using miniconda here. So uninstall both, and then just pip install hail.

Then which hailctl should still point to something with a /Users/.../miniconda3 prefix.
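
Concretely, something along these lines from a terminal on your laptop (a sketch of the suggested cleanup; the expected paths assume your miniconda install):

pip uninstall -y hail      # remove Hail from the miniconda Python
pip3 uninstall -y hail     # remove Hail from the /usr/local Python
pip install hail           # reinstall into miniconda only
which hailctl              # should print something under .../miniconda3/bin/
hailctl version            # should now report the newer version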

Thanks for your help! I uninstalled both and then reinstalled Hail. Now I am getting a different error on the same line of code:

---------------------------------------------------------------------------
FatalError                                Traceback (most recent call last)
<ipython-input-5-53d86367c321> in <module>
      1 # QC 1KG MT:
----> 2 mt1kg = hl.experimental.load_dataset(name='1000_Genomes_autosomes', version='phase_3' ,reference_genome='GRCh37')

/opt/conda/miniconda3/lib/python3.6/site-packages/hail/experimental/datasets.py in load_dataset(name, version, reference_genome, region, cloud)
    105         if not path.endswith('.mt'):
    106             raise ValueError('Invalid path {}: can only load datasets with .ht or .mt extensions.'.format(repr(path)))
--> 107         dataset = hl.read_matrix_table(path)
    108 
    109     return dataset

<decorator-gen-1429> in read_matrix_table(path, _intervals, _filter_intervals, _drop_cols, _drop_rows)

/opt/conda/miniconda3/lib/python3.6/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
    612     def wrapper(__original_func, *args, **kwargs):
    613         args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 614         return __original_func(*args_, **kwargs_)
    615 
    616     return wrapper

/opt/conda/miniconda3/lib/python3.6/site-packages/hail/methods/impex.py in read_matrix_table(path, _intervals, _filter_intervals, _drop_cols, _drop_rows)
   1996     :class:`.MatrixTable`
   1997     """
-> 1998     for rg_config in Env.backend().load_references_from_dataset(path):
   1999         hl.ReferenceGenome._from_config(rg_config)
   2000 

/opt/conda/miniconda3/lib/python3.6/site-packages/hail/backend/spark_backend.py in load_references_from_dataset(self, path)
    320 
    321     def load_references_from_dataset(self, path):
--> 322         return json.loads(Env.hail().variant.ReferenceGenome.fromHailDataset(self.fs._jfs, path))
    323 
    324     def from_fasta_file(self, name, fasta_file, index_file, x_contigs, y_contigs, mt_contigs, par):

/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258 
   1259         for temp_arg in temp_args:

/opt/conda/miniconda3/lib/python3.6/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
     30                 raise FatalError('%s\n\nJava stack trace:\n%s\n'
     31                                  'Hail version: %s\n'
---> 32                                  'Error summary: %s' % (deepest, full, hail.__version__, deepest), error_id) from None
     33         except pyspark.sql.utils.CapturedException as e:
     34             raise FatalError('%s\n\nJava stack trace:\n%s\n'

FatalError: GoogleJsonResponseException: 400 Bad Request
{
  "code" : 400,
  "errors" : [ {
    "domain" : "global",
    "message" : "Bucket is requester pays bucket but no user project provided.",
    "reason" : "required"
  } ],
  "message" : "Bucket is requester pays bucket but no user project provided."
}

Java stack trace:
java.io.IOException: Error accessing gs://hail-datasets-us/1000_Genomes_autosomes.phase_3.GRCh37.mt
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.getObject(GoogleCloudStorageImpl.java:1945)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.getItemInfo(GoogleCloudStorageImpl.java:1851)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem.getFileInfoInternal(GoogleCloudStorageFileSystem.java:1148)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem.getFileInfo(GoogleCloudStorageFileSystem.java:1116)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.getFileStatus(GoogleHadoopFileSystemBase.java:1121)
	at is.hail.io.fs.HadoopFS.fileStatus(HadoopFS.scala:151)
	at is.hail.io.fs.FS$class.isDir(FS.scala:172)
	at is.hail.io.fs.HadoopFS.isDir(HadoopFS.scala:70)
	at is.hail.expr.ir.RelationalSpec$.readMetadata(AbstractMatrixTableSpec.scala:31)
	at is.hail.expr.ir.RelationalSpec$.readReferences(AbstractMatrixTableSpec.scala:66)
	at is.hail.variant.ReferenceGenome$.fromHailDataset(ReferenceGenome.scala:596)
	at is.hail.variant.ReferenceGenome.fromHailDataset(ReferenceGenome.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)

com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.json.GoogleJsonResponseException: 400 Bad Request
{
  "code" : 400,
  "errors" : [ {
    "domain" : "global",
    "message" : "Bucket is requester pays bucket but no user project provided.",
    "reason" : "required"
  } ],
  "message" : "Bucket is requester pays bucket but no user project provided."
}
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.json.GoogleJsonResponseException.from(GoogleJsonResponseException.java:150)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.json.AbstractGoogleJsonClientRequest.newExceptionOnError(AbstractGoogleJsonClientRequest.java:113)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.json.AbstractGoogleJsonClientRequest.newExceptionOnError(AbstractGoogleJsonClientRequest.java:40)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.AbstractGoogleClientRequest$1.interceptResponse(AbstractGoogleClientRequest.java:401)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.http.HttpRequest.execute(HttpRequest.java:1097)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:499)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:432)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.execute(AbstractGoogleClientRequest.java:549)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.getObject(GoogleCloudStorageImpl.java:1939)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.getItemInfo(GoogleCloudStorageImpl.java:1851)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem.getFileInfoInternal(GoogleCloudStorageFileSystem.java:1148)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem.getFileInfo(GoogleCloudStorageFileSystem.java:1116)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.getFileStatus(GoogleHadoopFileSystemBase.java:1121)
	at is.hail.io.fs.HadoopFS.fileStatus(HadoopFS.scala:151)
	at is.hail.io.fs.FS$class.isDir(FS.scala:172)
	at is.hail.io.fs.HadoopFS.isDir(HadoopFS.scala:70)
	at is.hail.expr.ir.RelationalSpec$.readMetadata(AbstractMatrixTableSpec.scala:31)
	at is.hail.expr.ir.RelationalSpec$.readReferences(AbstractMatrixTableSpec.scala:66)
	at is.hail.variant.ReferenceGenome$.fromHailDataset(ReferenceGenome.scala:596)
	at is.hail.variant.ReferenceGenome.fromHailDataset(ReferenceGenome.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)




Hail version: 0.2.59-63cf625e29e5
Error summary: GoogleJsonResponseException: 400 Bad Request
{
  "code" : 400,
  "errors" : [ {
    "domain" : "global",
    "message" : "Bucket is requester pays bucket but no user project provided.",
    "reason" : "required"
  } ],
  "message" : "Bucket is requester pays bucket but no user project provided."
}

The above link describes requester pays a bit. Basically, Google charges money for moving data between continents (egress charges), and there’s a question of who should pay these charges. By default the owner of the bucket pays. When it’s a “requester-pays” bucket the downloader pays. You have to acknowledge that you’re willing to pay these fees when you start the cluster by specifying a special argument. In your case, you can add --requester-pays-allow-annotation-db. You won’t actually get any charges because you’re in the US and so is the data, but you still have to do the acknowledgement.
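
For example, the cluster start command might look something like this (the cluster name is hypothetical; the flag is the acknowledgement described above):

hailctl dataproc start my-cluster --requester-pays-allow-annotation-db

Once the cluster is up, the same load_dataset call should be able to read from gs://hail-datasets-us without the 400 error.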

Awesome! Got it working!