Hi,
I was using Hail to perform some variant QC on summary statistics downloaded from UKBB, working on the All of Us platform. I was able to run the pipeline, but failed to export the QC-ed summary stats as a block-gzipped (.bgz) file. Here is a sketch of my pipeline:
var_lst = hl.read_table("varlst_filename_in")
sumstats = hl.read_table("sumstats_filename_in")
var_lst = var_lst.annotate(**sumstats[var_lst.locus])
var_lst = var_lst.filter(xxx)
sumstats_qced = var_lst.select(col1 = var_lst.col1,
col2 = var_lst.col2
)
sumstats_qced.show() # this works
sumstats_qced.count() # this works, 8,604,715 rows
sumstats_qced.export("sumstats_qced.tsv.bgz") # this doesn't work, see error message
Here are the error messages:
ERROR:root:Exception while sending command.=============>(140094 + 32) / 140126]
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/py4j/java_gateway.py", line 1224, in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/py4j/java_gateway.py", line 1038, in send_command
response = connection.send_command(command)
File "/opt/conda/lib/python3.7/site-packages/py4j/java_gateway.py", line 1229, in send_command
"Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
/tmp/ipykernel_1091/2060613055.py in <module>
4 sumstats_QC_quant(f"{bucket}/Sumstats/continuous-50-both_sexes-irnt_checkpoint.ht",
5 f"{bucket}/Sumstats/WGS_Height_QCed.tsv.bgz",
----> 6 var_wgs)
7
8 print("DBP " + str(datetime.now()))
/tmp/ipykernel_1091/205199695.py in sumstats_QC_quant(ht_filename_in, filename_out, var_wgs)
41 neglog10_pval_meta = var_wgs.neglog10_pval_meta_hq)
42
---> 43 sumstats_QCed.export(filename_out)
<decorator-gen-1190> in export(self, output, types_file, header, parallel, delimiter)
/opt/conda/lib/python3.7/site-packages/hail/typecheck/check.py in wrapper(__original_func, *args, **kwargs)
575 def wrapper(__original_func, *args, **kwargs):
576 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 577 return __original_func(*args_, **kwargs_)
578
579 return wrapper
/opt/conda/lib/python3.7/site-packages/hail/table.py in export(self, output, types_file, header, parallel, delimiter)
1097 parallel = ir.ExportType.default(parallel)
1098 Env.backend().execute(
-> 1099 ir.TableWrite(self._tir, ir.TableTextWriter(output, types_file, header, parallel, delimiter)))
1100
1101 def group_by(self, *exprs, **named_exprs) -> 'GroupedTable':
/opt/conda/lib/python3.7/site-packages/hail/backend/py4j_backend.py in execute(self, ir, timed)
97 # print(self._hail_package.expr.ir.Pretty.apply(jir, True, -1))
98 try:
---> 99 result_tuple = self._jbackend.executeEncode(jir, stream_codec, timed)
100 (result, timings) = (result_tuple._1(), result_tuple._2())
101 value = ir.typ._from_encoding(result)
/opt/conda/lib/python3.7/site-packages/py4j/java_gateway.py in __call__(self, *args)
1321 answer = self.gateway_client.send_command(command)
1322 return_value = get_return_value(
-> 1323 answer, self.gateway_client, self.target_id, self.name)
1324
1325 for temp_arg in temp_args:
/opt/conda/lib/python3.7/site-packages/hail/backend/py4j_backend.py in deco(*args, **kwargs)
19 import pyspark
20 try:
---> 21 return f(*args, **kwargs)
22 except py4j.protocol.Py4JJavaError as e:
23 s = e.java_exception.toString()
/opt/conda/lib/python3.7/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
334 raise Py4JError(
335 "An error occurred while calling {0}{1}{2}".
--> 336 format(target_id, ".", name))
337 else:
338 type = answer[1]
Py4JError: An error occurred while calling o1.executeEncode