I’m running into an out-of-memory error when trying to densify a MatrixTable:
[Stage 15:===> (2002 + 3006) / 30000]OpenJDK 64-Bit Server VM warning: INFO: os::commit_memory(0x00007fb132d80000, 4354736128, 0) failed; error='Cannot allocate memory' (errno=12)
#
# There is insufficient memory for the Java Runtime Environment to continue.
# Native memory allocation (mmap) failed to map 4354736128 bytes for committing reserved memory.
# An error report file with more information is saved as:
# /tmp/c98e624072f34b96a616efa158df27f5/hs_err_pid31285.log
Here is the code:
def main(args):
    """Build and checkpoint the dense QC MatrixTable for a UKBB freeze.

    When ``--compute_qc_mt`` is set, this reads the raw sparse MT, densifies
    it at the predetermined QC sites, repartitions, annotates site-level and
    allele-specific info, restricts entries to GT plus an adj flag, filters
    to adj, and checkpoints after each expensive stage.
    """
    hl.init(log="/create_qc_data.log", default_reference="GRCh38")
    data_source = "broad"
    freeze = args.freeze

    if args.compute_qc_mt:
        logger.info("Reading in raw MT...")
        mt = get_ukbb_data(
            data_source,
            freeze,
            key_by_locus_and_alleles=True,
            split=False,
            raw=True,
            repartition=args.repartition,
            n_partitions=args.raw_partitions,
        )
        # Drop every entry field except the ones the densify + QC steps need
        # (local-allele sparse schema: LGT/LAD/LA plus GQ, DP, END).
        mt = mt.select_entries("LGT", "GQ", "DP", "LAD", "LA", "END")

        logger.info("Reading in QC MT sites from tranche 2/freeze 5...")
        # Regenerate the sites table if a previous write never finished
        # (no _SUCCESS marker).
        if not hl.utils.hadoop_exists(f"{qc_sites_path()}/_SUCCESS"):
            get_qc_mt_sites()
        sites_ht = hl.read_table(qc_sites_path())
        logger.info(f"Number of QC sites: {sites_ht.count()}")

        logger.info("Densifying sites...")
        last_end_ht = hl.read_table(last_END_positions_ht_path(freeze))
        mt = densify_sites(mt, sites_ht, last_end_ht)

        logger.info("Checkpointing densified MT")
        mt = mt.checkpoint(
            get_checkpoint_path(
                data_source, freeze, name="dense_qc_mt_v2_sites", mt=True
            ),
            overwrite=True,
        )

        logger.info("Repartitioning densified MT")
        mt = mt.naive_coalesce(args.n_partitions)
        mt = mt.checkpoint(
            get_checkpoint_path(
                data_source, freeze, name="dense_qc_mt_v2_sites.repartitioned", mt=True,
            ),
            overwrite=True,
        )

        # NOTE: Need MQ, QD, FS for hard filters
        logger.info("Adding info and low QUAL annotations and filtering to adj...")
        site_info = get_site_info_expr(mt)
        site_info = site_info.annotate(**get_as_info_expr(mt))
        mt = mt.annotate_rows(info=site_info)

        # Translate local-allele genotypes to global GT, then keep only GT
        # plus the adj flag before applying the adj filter.
        mt = mt.annotate_entries(GT=hl.experimental.lgt_to_gt(mt.LGT, mt.LA))
        mt = mt.select_entries("GT", adj=get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD))
        mt = filter_to_adj(mt)

        logger.info("Checkpointing MT...")
        mt = mt.checkpoint(
            get_checkpoint_path(
                data_source,
                freeze,
                name=f"{data_source}.freeze_{freeze}.qc_sites.mt",
                mt=True,
            ),
            overwrite=True,
        )
densify_sites: https://github.com/broadinstitute/gnomad_methods/blob/master/gnomad/utils/sparse_mt.py#L62
I added two checkpoints to densify_sites, one for sites_ht and one for mt. After the densify crashed, I re-ran the code using the checkpointed data and got the same error. The log doesn’t seem to want to attach, but I can send it via Slack.
I have already run a densify on the same input MT successfully for another task. Could you help me figure out why this smaller densify on fewer rows is running out of memory (same cluster configuration)?