Hello!
I am running into an odd issue with attaching GRCh38 fasta and fasta.fai files to a ReferenceGenome object. For context, this is running on UK Biobank RAP in Jupyter Lab with a Spark cluster. I am not sure if this is being caused by how I am configuring my Spark cluster, how I am storing my reference files, or a backend issue with DNAnexus.
Here is a minimal example that causes the problem:
from pyspark.sql import SparkSession
import hail as hl
import dxpy

# Build (or attach to) a Spark session with Hive support so Hail can use the cluster.
builder = SparkSession.builder.enableHiveSupport()
spark = builder.getOrCreate()

# Locate the DNAnexus database object that stores the reference files.
db_uri = dxpy.find_one_data_object(
    name="my_db",
    classname="database",
)["id"]

fasta_path = f"dnax://{db_uri}/references/Homo_sapiens_assembly38.fasta.gz"
# NOTE: the original line ended with a curly quote (”) instead of a straight
# double quote, which is a SyntaxError as pasted — fixed here.
fai_path = f"dnax://{db_uri}/references/Homo_sapiens_assembly38.fasta.fai"

# Attach the FASTA + index to the built-in GRCh38 reference genome.
reference_genome = hl.get_reference('GRCh38')
reference_genome.add_sequence(
    fasta_file=fasta_path,
    index_file=fai_path,
)

# Simple test: hl.eval runs locally on the driver, where the sequence
# files were attached — this succeeds.
test = hl.eval(
    hl.get_sequence("chr1", 1000000, 0, 10, "GRCh38")
)
print("Simple test:", test)

# Annotating a table: this is executed on the Spark executors, which is
# where the NoSuchFileException is raised (the temp FASTA copy exists only
# on the driver node).
ht = hl.utils.range_table(10)
ht = ht.annotate(
    seq=hl.get_sequence("chr1", 1000000 + ht.idx, 0, 10, "GRCh38")
)
ht.show()
The simple test returns “GGTGGAGCGCG” as expected, and reference_genome.has_sequence() confirms that the files were attached. However, annotating a table (which is what I would eventually like to do) throws the following error:
---------------------------------------------------------------------------
FatalError Traceback (most recent call last)
File /opt/conda/lib/python3.12/site-packages/IPython/core/formatters.py:708, in PlainTextFormatter._call_(self, obj)
701 stream = StringIO()
702 printer = pretty.RepresentationPrinter(stream, self.verbose,
703 self.max_width, self.newline,
704 max_seq_length=self.max_seq_length,
705 singleton_pprinters=self.singleton_printers,
706 type_pprinters=self.type_printers,
707 deferred_pprinters=self.deferred_printers)
→ 708 printer.pretty(obj)
709 printer.flush()
710 return stream.getvalue()
File /opt/conda/lib/python3.12/site-packages/IPython/lib/pretty.py:410, in RepresentationPrinter.pretty(self, obj)
407 return meth(obj, self, cycle)
408 if cls is not object \
409 and callable(cls._dict_.get(‘_repr_’)):
→ 410 return _repr_pprint(obj, self, cycle)
412 return _default_pprint(obj, self, cycle)
413 finally:
File /opt/conda/lib/python3.12/site-packages/IPython/lib/pretty.py:778, in _repr_pprint(obj, p, cycle)
776 “”“A pprint that just redirects to the normal repr function.”“”
777 # Find newlines and replace them with p.break_()
→ 778 output = repr(obj)
779 lines = output.splitlines()
780 with p.group():
File /opt/conda/lib/python3.12/site-packages/hail/table.py:2160, in Table._Show._repr_(self)
2159 def _repr_(self):
→ 2160 return self._str_()
File /opt/conda/lib/python3.12/site-packages/hail/table.py:2157, in Table._Show._str_(self)
2156 def _str_(self):
→ 2157 return self._ascii_str()
File /opt/conda/lib/python3.12/site-packages/hail/table.py:2183, in Table._Show._ascii_str(self)
2180 return s[: truncate - 3] + “…”
2181 return s
→ 2183 rows, has_more, dtype = self.data()
2184 fields = list(dtype)
2185 trunc_fields = [trunc(f) for f in fields]
File /opt/conda/lib/python3.12/site-packages/hail/table.py:2167, in Table._Show.data(self)
2165 row_dtype = t.row.dtype
2166 t = t.select(**{k: hl._showstr(v) for (k, v) in t.row.items()})
→ 2167 rows, has_more = t._take_n(self.n)
2168 self._data = (rows, has_more, row_dtype)
2169 return self._data
File /opt/conda/lib/python3.12/site-packages/hail/table.py:2313, in Table._take_n(self, n)
2311 has_more = False
2312 else:
→ 2313 rows = self.take(n + 1)
2314 has_more = len(rows) > n
2315 rows = rows[:n]
File :2, in take(self, n, _localize)
File /opt/conda/lib/python3.12/site-packages/hail/typecheck/check.py:585, in _make_dec..wrapper(__original_func, *args, **kwargs)
582 @decorator
583 def wrapper(__original_func: Callable[…, T], *args, **kwargs) → T:
584 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
→ 585 return __original_func(*args_, **kwargs_)
File /opt/conda/lib/python3.12/site-packages/hail/table.py:3030, in Table.take(self, n, _localize)
2996 @typecheck_method(n=int, _localize=bool)
2997 def take(self, n, _localize=True):
2998 “”“Collect the first `n` rows of the table into a local list.
2999
3000 Examples
(…)
3027 List of row structs.
3028 “””
→ 3030 return self.head(n).collect(_localize)
File :2, in collect(self, _localize, _timed)
File /opt/conda/lib/python3.12/site-packages/hail/typecheck/check.py:585, in _make_dec..wrapper(__original_func, *args, **kwargs)
582 @decorator
583 def wrapper(__original_func: Callable[…, T], *args, **kwargs) → T:
584 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
→ 585 return __original_func(*args_, **kwargs_)
File /opt/conda/lib/python3.12/site-packages/hail/table.py:2817, in Table.collect(self, _localize, _timed)
2815 e = construct_expr(rows_ir, hl.tarray(t.row.dtype))
2816 if _localize:
→ 2817 return Env.backend().execute(e._ir, timed=_timed)
2818 else:
2819 return e
File /opt/conda/lib/python3.12/site-packages/hail/backend/spark_backend.py:217, in SparkBackend.execute(self, ir, timed)
214 except Exception as fatal:
215 raise err from fatal
→ 217 raise err
File /opt/conda/lib/python3.12/site-packages/hail/backend/spark_backend.py:209, in SparkBackend.execute(self, ir, timed)
207 def execute(self, ir: BaseIR, timed: bool = False) → Any:
208 try:
→ 209 return super().execute(ir, timed)
210 except Exception as err:
211 if self._copy_log_on_error:
File /opt/conda/lib/python3.12/site-packages/hail/backend/backend.py:181, in Backend.execute(self, ir, timed)
179 result, timings = self._rpc(ActionTag.EXECUTE, payload)
180 except FatalError as e:
→ 181 raise e.maybe_user_error(ir) from None
182 if ir.typ == tvoid:
183 value = None
File /opt/conda/lib/python3.12/site-packages/hail/backend/backend.py:179, in Backend.execute(self, ir, timed)
177 payload = ExecutePayload(self._render_ir(ir), ‘{“name”:“StreamBufferSpec”}’, timed)
178 try:
→ 179 result, timings = self._rpc(ActionTag.EXECUTE, payload)
180 except FatalError as e:
181 raise e.maybe_user_error(ir) from None
File /opt/conda/lib/python3.12/site-packages/hail/backend/py4j_backend.py:221, in Py4JBackend._rpc(self, action, payload)
219 if resp.status_code >= 400:
220 error_json = orjson.loads(resp.content)
→ 221 raise fatal_error_from_java_error_triplet(
222 error_json[‘short’], error_json[‘expanded’], error_json[‘error_id’]
223 )
224 return resp.content, resp.headers.get(‘X-Hail-Timings’, ‘’)
FatalError: NoSuchFileException: /tmp/fasta-reader-HbsWHzgHj6K6F2FsK2PV3n.fasta
I have also tried a few other approaches to attaching the reference sequence files:
- Using !wget to download the reference files to the Jupyter notebook working directory
- Referring to the files from publicly available cloud storage using either s3:// or s3a://, so:
# Attach the public Broad hg38 reference from S3 instead of dnax storage.
# Fixed two paste errors from the original: a missing comma between the two
# keyword arguments (SyntaxError) and curly closing quotes (”) in both
# string literals.
reference_genome.add_sequence(
    fasta_file="s3a://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta",
    index_file="s3a://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
)
In both cases, adding the sequence is initially successful and the simple test runs fine. But annotating the Hail table throws the same error. This code runs fine on the All of Us Researcher Workbench, which is what made me suspect that it could either be an issue with how I am configuring Spark or a backend issue with DNAnexus.
Any advice would be appreciated!
John