Hello,
I am trying to use HAIL to quickly annotate VCF files using VEP/LOFTEE and filter for LOF variant carriers. My code works great up until I need to extract the Subject IDs from the MT and then I have faced a host of issues.
start = time.time()
# Step 2: Read VCF files
vcf_files = [
"file:///mnt/project/file1.vcf.gz",
"file:///mnt/project/file2.vcf.gz"
]
mt = hl.import_vcf(
vcf_files,
force_bgz=True,
reference_genome="GRCh38",
min_partitions=100
)
end = time.time()
print(f"Step 2 took {end - start:.4f} seconds")
# Step 3: Normalize
start = time.time()
mt = hl.split_multi_hts(mt)
end = time.time()
print(f"Step 3 took {end - start:.4f} seconds")
# Step 4: Annotate with VEP
start = time.time()
mt = hl.vep(mt, "file:///mnt/project/vep-GRCh38.json")
end = time.time()
print(f"Step 4 took {end - start:.4f} seconds")
# Step 5: Filter for LOF in target gene
start = time.time()
lof_consequences_hl = hl.set([
"transcript_ablation", "splice_acceptor_variant", "splice_donor_variant",
"stop_gained", "frameshift_variant", "start_lost", "stop_lost"
])
mt = mt.filter_rows(
hl.any(lambda csq: (
(csq.gene_symbol == "PCSK9") &
hl.any(lambda term: lof_consequences_hl.contains(term), csq.consequence_terms)
), mt.vep.transcript_consequences)
)
end = time.time()
print(f"Step 5 took {end - start:.4f} seconds")
# Step 6: Annotate each sample with a flag if they carry any non-ref genotype
start = time.time()
mt = mt.annotate_cols(
is_lof_carrier = hl.agg.any(mt.GT.is_non_ref())
)
end = time.time()
print(f"Step 6 took {end - start:.4f} seconds")
# Step 7: Collect the sample IDs of carriers
start = time.time()
lof_mt = mt.filter_cols(mt.is_lof_carrier)
# This doesn't work lol (seems to run forever)
carrier_ids = lof_mt.s.collect()
I’ve tried a bunch of .collect(). .select() but I can’t seem to figure this out. I assume I am missing something blatantly obvious but I’ve been stuck on this last part for a while.
Any assistance would be greatly appreciated!