Hi Hail team,
I previously have been able to subset very small vcfs around 55 samples, but not have had to do any subsetting at a large scale.
I am currently trying to use hail to subset large ~40k sample vcfs (by chromosome) into 48 subsetted vcfs. This would end up being 22 (chromosomes) * 48 (subsetted lists) = 1,056 subsetted vcfs.
Our team was never able to get a docker image up where we could run hail correctly in the cloud, so we are attempting to do this on prem using Uger. When attempting to subset just one of these vcfs the job ran for over one day and ended up just stopping without completion. Below is the script used to do the subsetting. Do you have any suggestions for how this could be completed?
#!/usr/bin/env python
import hail as hl
import sys
import os
import argparse
import logging
logging.basicConfig(
format='%(levelname)s: %(asctime)s : %(message)s', level=logging.INFO
)
def parse_command_line():
"""Fail or return arguments parsed from the command line."""
argparser = argparse.ArgumentParser(description=__doc__)
argparser.add_argument('-i', '--input_vcf', required=True)
argparser.add_argument('-t', '--tsv', required=True)
argparser.add_argument('-c', '--chromosome', required=True)
return argparser.parse_args()
def subset_vcf(mt, subset_list, subset_vcf_path):
logging.info(f'subsetting {subset_vcf_path} from vcf')
with open(subset_list) as f:
# column one is the samples
samples = [line.split('\t')[0] for line in f]
mt = mt.filter_cols(hl.literal(samples).contains(mt.s), keep=True)
# write out a new VCF
logging.info(f'writting out {subset_vcf_path}')
hl.export_vcf(mt, subset_vcf_path)
def create_directory(vcf_name):
vcf_path = os.path.join('/seq/tng/newCloudJointGenotyping/PO-23593', vcf_name)
if not os.path.exists(vcf_path):
os.makedirs(vcf_path)
return vcf_path
if __name__ == "__main__":
args = parse_command_line()
hl.init(default_reference='GRCh38')
# read in the vcf. Replace nul with .
logging.info('Loading in vcf')
mt = hl.import_vcf(args.input_vcf, force=True, find_replace=('nul', '.'))
with open(args.tsv) as f:
for line in f:
vcf_name = line.rstrip()
subset_list = os.path.join(
'/seq/tng/newCloudJointGenotyping/PO-23593/sampleLists',
line.rstrip()
)
directory = create_directory(vcf_name)
vcf_full_name = f'{vcf_name}.chr{args.chromosome}.vcf.bgz'
vcf_path = os.path.join(directory, vcf_full_name)
subset_vcf(mt, subset_list, vcf_path)
Thanks,
Sam