Inbreeding Calculation Run Time

Hello,

Our group is in the process of scaling up our hail sample QC script to process the nearly 500k WES samples on the UK Biobank Research Analysis Platform. This component of our script that generates inbreeding stats, and generates a summary .tsv file for these and prior calculated stats takes an overwhelming portion of the run time.

mt = hl.variant_qc(mt, 'qc')

chrX_good_variants= mt.filter_rows(
    (
        # Only keep ChrX non-monomorphic variants with all filters passed
        # Used for heterozygosity sex check
        # (vcf.filter_pass==True) & (vcf.locus.in_x_nonpar())
        (hl.len(mt.filters) == 0) & (mt.locus.in_x_nonpar())
        & (mt.qc.AC[0] >= 1) & (mt.qc.AC[1] >= 1)
    ),
    keep=True
)
summary.append(('SexCheck', chrX_good_variants))
print("Exporting heterozygosity/inbreeding statistics (Chr X)")

sample_df = sample_df.join(
    chrX_good_variants.annotate_cols(
        inbreeding=hl.agg.inbreeding(chrX_good_variants.GT, chrX_good_variants.qc.AF[1])
    ).cols().to_pandas()[[
        's', 'inbreeding.f_stat', 'inbreeding.n_called',
        'inbreeding.expected_homs', 'inbreeding.observed_homs'
    ]].rename(
        {
            's': 'sample_id',
            'inbreeding.f_stat': 'inbreedingFStatX',
            'inbreeding.n_called': 'inbreedingCallsX',
            'inbreeding.expected_homs': 'inbreedingExpectedHomX',
            'inbreeding.observed_homs': 'inbreedingObservedHomX'
        },
        axis='columns'
    ).set_index('sample_id'),
    how='left'
)

# Also add y nonpar heterozygosity columns

chrY_good_variants= mt.filter_rows(
    (
        # Only keep Chry non-monomorphic variants with all filters passed
        # Used for heterozygosity sex check
        (hl.len(mt.filters) == 0) & (mt.locus.in_y_nonpar())
        & (mt.qc.AC[0] >= 1) & (mt.qc.AC[1] >= 1)
    ),
    keep=True
)
summary.append(('SexCheckY', chrY_good_variants))
print("Exporting heterozygosity/inbreeding statistics (Chr Y)")

sample_df = sample_df.join(
    chrY_good_variants.annotate_cols(
        inbreeding=hl.agg.inbreeding(chrY_good_variants.GT, chrY_good_variants.qc.AF[1])
    ).cols().to_pandas()[[
        's', 'inbreeding.f_stat', 'inbreeding.n_called',
        'inbreeding.expected_homs', 'inbreeding.observed_homs'
    ]].rename(
        {
            's': 'sample_id',
            'inbreeding.f_stat': 'inbreedingFStatY',
            'inbreeding.n_called': 'inbreedingCallsY',
            'inbreeding.expected_homs': 'inbreedingExpectedHomY',
            'inbreeding.observed_homs': 'inbreedingObservedHomY'
        },
        axis='columns'
    ).set_index('sample_id'),
    how='left'
)

path = args.prefix + '.vcf_sample_qc.tsv'
print("Writing collected metrics to", path)
with create_fh(path, 'w') as w:
    sample_df.to_csv(
        w,
        sep='\t'
    )

print("Finished generating primary outputs")
print("The summary file may take a while to finish")
print("If you are not interested in the summary file, you may cancel the job now")
output_summary(generate_summary(summary), args.prefix+"_var_qc.tsv")

We have tried to generate a MT from the hl.agg.inbreeding output (shown below), but that is not allowed as the function: hl.agg_inbreeding creates a StructExpression that does not work with the ‘write’ command. Does anyone have suggestions for how to increase the parallelism of this component and decrease run time? Thank you!

Blockquote
mt = hl.variant_qc(mt, ‘qc’)
chrX_good_variants= mt.filter_rows(
(
# Only keep ChrX non-monomorphic variants with all filters passed
# Used for heterozygosity sex check
# (vcf.filter_pass==True) & (vcf.locus.in_x_nonpar())
(hl.len(mt.filters) == 0) & (mt.locus.in_x_nonpar())
& (mt.qc.AC[0] >= 1) & (mt.qc.AC[1] >= 1)
),
keep=True
)
summary.append((‘SexCheck’, chrX_good_variants))
print(“Exporting heterozygosity/inbreeding statistics (Chr X)”)

inbreedingX=hl.agg.inbreeding(chrX_good_variants.GT, chrX_good_variants.qc.AF[1])
inbreedingX.write('inbreedingX.mt')
inbreedingX_mt=read_matrix_table('inbreedingX.mt')

sample_df = sample_df.join(
    chrX_good_variants.annotate_cols(
        inbreedingX_mt
    ).cols().to_pandas()[[
        's', 'inbreeding.f_stat', 'inbreeding.n_called',
        'inbreeding.expected_homs', 'inbreeding.observed_homs'
    ]].rename(
        {
            's': 'sample_id',
            'inbreeding.f_stat': 'inbreedingFStatX',
            'inbreeding.n_called': 'inbreedingCallsX',
            'inbreeding.expected_homs': 'inbreedingExpectedHomX',
            'inbreeding.observed_homs': 'inbreedingObservedHomX'
        },
        axis='columns'
    ).set_index('sample_id'),
    how='left'
)

# Also add y nonpar heterozygosity columns

chrY_good_variants= mt.filter_rows(
    (
        # Only keep Chry non-monomorphic variants with all filters passed
        # Used for heterozygosity sex check
        (hl.len(mt.filters) == 0) & (mt.locus.in_y_nonpar())
        & (mt.qc.AC[0] >= 1) & (mt.qc.AC[1] >= 1)
    ),
    keep=True
)
summary.append(('SexCheckY', chrY_good_variants))
print("Exporting heterozygosity/inbreeding statistics (Chr Y)")

inbreedingY=hl.agg.inbreeding(chrY_good_variants.GT, chrY_good_variants.qc.AF[1])
inbreedingY.write('inbreedingY.mt')
inbreedingY_mt=read_matrix_table('inbreedingY.mt')

sample_df = sample_df.join(
    chrY_good_variants.annotate_cols(
        inbreedingY_mt
    ).cols().to_pandas()[[
        's', 'inbreeding.f_stat', 'inbreeding.n_called',
        'inbreeding.expected_homs', 'inbreeding.observed_homs'
    ]].rename(
        {
            's': 'sample_id',
            'inbreeding.f_stat': 'inbreedingFStatY',
            'inbreeding.n_called': 'inbreedingCallsY',
            'inbreeding.expected_homs': 'inbreedingExpectedHomY',
            'inbreeding.observed_homs': 'inbreedingObservedHomY'
        },
        axis='columns'
    ).set_index('sample_id'),
    how='left'
)

path = args.prefix + '.vcf_sample_qc.tsv'
print("Writing collected metrics to", path)
with create_fh(path, 'w') as w:
    sample_df.to_csv(
        w,
        sep='\t'
    )

print("Finished generating primary outputs")
print("The summary file may take a while to finish")
print("If you are not interested in the summary file, you may cancel the job now")
output_summary(generate_summary(summary), args.prefix+"_var_qc.tsv")