Hi hail team,
I’m running into a weird error while trying to filter the rows of a MatrixTable. I’m running:
hl.init(log="/apply_hard_filters.log", default_reference="GRCh38")
data_source = "broad"
freeze = args.freeze
if not file_exists(callrate_mt_path(data_source, freeze, interval_filtered=False)):
raise DataException("Need to compute interval callrate MT!")
logger.info("Reading in callrate MT, sex ht, interval qc HT...")
callrate_mt = hl.read_matrix_table(
callrate_mt_path(data_source, freeze, interval_filtered=False)
)
sex_ht = hl.read_table(sex_ht_path(data_source, freeze))
interval_qc_ht = hl.read_table(interval_qc_path(data_source, freeze, "autosomes"))
interval_qc_ht = interval_qc_ht.filter(
interval_qc_ht[args.cov_filter_field] > args.pct_samples
)
logger.info("Hard filtering samples...")
hard_filters_ht = hard_filter_samples(
data_source,
freeze,
callrate_mt,
interval_qc_ht,
sex_ht,
args.min_callrate,
args.min_dp,
)
and `hard_filter_samples` is defined as:
def hard_filter_samples(
    data_source: str,
    freeze: int,
    mt: hl.MatrixTable,
    interval_qc_ht: hl.Table,
    sex_ht: hl.Table,
    min_callrate: float = 0.99,
    min_depth: float = 20.0,
) -> hl.Table:
    """
    Applies hard filters to samples and returns Table with samples and their hard filter status.

    This function expects the input MT to be annotated with the fields n_defined, total, and dp_sum.
    These are calculated using compute_interval_callrate_dp_mt.

    Side effects: the interval-filtered MT is checkpointed to
    ``callrate_mt_path(data_source, freeze, interval_filtered=True)`` and the per-sample
    callrate/depth/sex Table is checkpointed to the ``interval_qc_sample_qc`` checkpoint path.
    Both checkpoints overwrite any existing output.

    :param str data_source: One of 'regeneron' or 'broad'
    :param int freeze: One of the data freezes
    :param MatrixTable mt: Input MatrixTable with samples to be filtered
    :param Table interval_qc_ht: Table with high coverage intervals
    :param Table sex_ht: Table with samples and their inferred sex
    :param float min_callrate: Callrate threshold to be used to filter samples; default is 0.99
    :param float min_depth: Mean depth threshold to be used to filter samples; default is 20.0
    :return: Table with samples and their hard filter status. The `hard_filters` struct carries
        the individual filter flags plus a combined `hard_filtered` flag; the temporary `sex`
        field is dropped.
    :rtype: hl.Table
    """
    logger.info("Computing callrate and mean DP over high coverage intervals...")
    # Keep only the rows whose key is present in the high coverage interval Table.
    mt = mt.filter_rows(hl.is_defined(interval_qc_ht[mt.row_key]))
    # overwrite=True matches the second checkpoint below, so a rerun does not
    # fail on output left behind by a previous run.
    mt = mt.checkpoint(
        callrate_mt_path(data_source, freeze, interval_filtered=True),
        overwrite=True,
    )

    # Per-sample aggregates over the remaining rows.
    ht = mt.annotate_cols(
        call_rate=hl.agg.sum(mt.n_defined) / hl.agg.sum(mt.total),
        mean_dp=hl.agg.sum(mt.dp_sum) / hl.agg.sum(mt.total),
    ).cols()

    logger.info("Adding sex imputation annotations...")
    ht = ht.annotate(sex=sex_ht[ht.key].sex_karyotype)
    ht = ht.checkpoint(
        get_checkpoint_path(data_source, freeze, name="interval_qc_sample_qc"),
        overwrite=True,
    )

    logger.info("Applying hard filters and writing out hard filters HT...")
    ht = ht.annotate(
        hard_filters=apply_hard_filters_expr(
            ht.call_rate, ht.mean_dp, ht.sex, min_callrate, min_depth
        )
    )
    # Table.annotate only accepts keyword arguments, so the extended struct has
    # to be assigned back to a named field rather than passed positionally.
    # NOTE(review): this keeps the flags nested under `hard_filters`; if top-level
    # fields were intended, unpack the struct with `**` instead.
    ht = ht.annotate(
        hard_filters=ht.hard_filters.annotate(
            hard_filtered=(
                (ht.hard_filters.low_callrate)
                | (ht.hard_filters.ambiguous_sex)
                | (ht.hard_filters.sex_aneuploidy)
                | (ht.hard_filters.low_coverage)
            )
        )
    )
    return ht.drop("sex")
I seem to be getting a weird error from the `filter_rows` step; it is raised when the filtered MT is checkpointed:
[Stage 0:=> (598 + 301) / 30000]Traceback (most recent call last):
File "/tmp/f10f5b76f6964dea8a82b0aa025c9375/apply_hard_filters.py", line 213, in <module>
main(args)
File "/tmp/f10f5b76f6964dea8a82b0aa025c9375/apply_hard_filters.py", line 153, in main
args.min_dp,
File "/tmp/f10f5b76f6964dea8a82b0aa025c9375/apply_hard_filters.py", line 92, in hard_filter_samples
mt = mt.checkpoint(callrate_mt_path(data_source, freeze, interval_filtered=True), overwrite=True)
File "<decorator-gen-1090>", line 2, in checkpoint
File "/opt/conda/default/lib/python3.6/site-packages/hail/typecheck/check.py", line 585, in wrapper
return __original_func(*args_, **kwargs_)
File "/opt/conda/default/lib/python3.6/site-packages/hail/matrixtable.py", line 2490, in checkpoint
self.write(output=output, overwrite=overwrite, stage_locally=stage_locally, _codec_spec=_codec_spec)
File "<decorator-gen-1092>", line 2, in write
File "/opt/conda/default/lib/python3.6/site-packages/hail/typecheck/check.py", line 585, in wrapper
return __original_func(*args_, **kwargs_)
File "/opt/conda/default/lib/python3.6/site-packages/hail/matrixtable.py", line 2529, in write
Env.backend().execute(MatrixWrite(self._mir, writer))
File "/opt/conda/default/lib/python3.6/site-packages/hail/backend/backend.py", line 109, in execute
result = json.loads(Env.hc()._jhc.backend().executeJSON(self._to_java_ir(ir)))
File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/opt/conda/default/lib/python3.6/site-packages/hail/utils/java.py", line 225, in deco
'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
hail.utils.java.FatalError: HailException: cannot set missing field for required type +PCStruct{pct_samples_20x:PFloat64}
Both `interval_qc_ht` and `mt` are keyed by interval. Can you help me decipher the `cannot set missing field for required type +PCStruct{pct_samples_20x:PFloat64}` error?
Log: apply_hard_filters.log (2.6 MB)