Inbreeding Calculation Run Time

mgarcia · July 28, 2023, 3:42pm

Hello,

Our group is in the process of scaling up our hail sample QC script to process the nearly 500k WES samples on the UK Biobank Research Analysis Platform. This component of our script that generates inbreeding stats, and generates a summary .tsv file for these and prior calculated stats takes an overwhelming portion of the run time.

mt = hl.variant_qc(mt, 'qc')

chrX_good_variants= mt.filter_rows(
    (
        # Only keep ChrX non-monomorphic variants with all filters passed
        # Used for heterozygosity sex check
        # (vcf.filter_pass==True) & (vcf.locus.in_x_nonpar())
        (hl.len(mt.filters) == 0) & (mt.locus.in_x_nonpar())
        & (mt.qc.AC[0] >= 1) & (mt.qc.AC[1] >= 1)
    ),
    keep=True
)
summary.append(('SexCheck', chrX_good_variants))
print("Exporting heterozygosity/inbreeding statistics (Chr X)")

sample_df = sample_df.join(
    chrX_good_variants.annotate_cols(
        inbreeding=hl.agg.inbreeding(chrX_good_variants.GT, chrX_good_variants.qc.AF[1])
    ).cols().to_pandas()[[
        's', 'inbreeding.f_stat', 'inbreeding.n_called',
        'inbreeding.expected_homs', 'inbreeding.observed_homs'
    ]].rename(
        {
            's': 'sample_id',
            'inbreeding.f_stat': 'inbreedingFStatX',
            'inbreeding.n_called': 'inbreedingCallsX',
            'inbreeding.expected_homs': 'inbreedingExpectedHomX',
            'inbreeding.observed_homs': 'inbreedingObservedHomX'
        },
        axis='columns'
    ).set_index('sample_id'),
    how='left'
)

# Also add y nonpar heterozygosity columns

chrY_good_variants= mt.filter_rows(
    (
        # Only keep Chry non-monomorphic variants with all filters passed
        # Used for heterozygosity sex check
        (hl.len(mt.filters) == 0) & (mt.locus.in_y_nonpar())
        & (mt.qc.AC[0] >= 1) & (mt.qc.AC[1] >= 1)
    ),
    keep=True
)
summary.append(('SexCheckY', chrY_good_variants))
print("Exporting heterozygosity/inbreeding statistics (Chr Y)")

sample_df = sample_df.join(
    chrY_good_variants.annotate_cols(
        inbreeding=hl.agg.inbreeding(chrY_good_variants.GT, chrY_good_variants.qc.AF[1])
    ).cols().to_pandas()[[
        's', 'inbreeding.f_stat', 'inbreeding.n_called',
        'inbreeding.expected_homs', 'inbreeding.observed_homs'
    ]].rename(
        {
            's': 'sample_id',
            'inbreeding.f_stat': 'inbreedingFStatY',
            'inbreeding.n_called': 'inbreedingCallsY',
            'inbreeding.expected_homs': 'inbreedingExpectedHomY',
            'inbreeding.observed_homs': 'inbreedingObservedHomY'
        },
        axis='columns'
    ).set_index('sample_id'),
    how='left'
)

path = args.prefix + '.vcf_sample_qc.tsv'
print("Writing collected metrics to", path)
with create_fh(path, 'w') as w:
    sample_df.to_csv(
        w,
        sep='\t'
    )

print("Finished generating primary outputs")
print("The summary file may take a while to finish")
print("If you are not interested in the summary file, you may cancel the job now")
output_summary(generate_summary(summary), args.prefix+"_var_qc.tsv")

We have tried to generate a MT from the hl.agg.inbreeding output (shown below), but that is not allowed as the function: hl.agg_inbreeding creates a StructExpression that does not work with the ‘write’ command. Does anyone have suggestions for how to increase the parallelism of this component and decrease run time? Thank you!

Blockquote
mt = hl.variant_qc(mt, ‘qc’)
chrX_good_variants= mt.filter_rows(
(
# Only keep ChrX non-monomorphic variants with all filters passed
# Used for heterozygosity sex check
# (vcf.filter_pass==True) & (vcf.locus.in_x_nonpar())
(hl.len(mt.filters) == 0) & (mt.locus.in_x_nonpar())
& (mt.qc.AC[0] >= 1) & (mt.qc.AC[1] >= 1)
),
keep=True
)
summary.append((‘SexCheck’, chrX_good_variants))
print(“Exporting heterozygosity/inbreeding statistics (Chr X)”)

inbreedingX=hl.agg.inbreeding(chrX_good_variants.GT, chrX_good_variants.qc.AF[1])
inbreedingX.write('inbreedingX.mt')
inbreedingX_mt=read_matrix_table('inbreedingX.mt')

sample_df = sample_df.join(
    chrX_good_variants.annotate_cols(
        inbreedingX_mt
    ).cols().to_pandas()[[
        's', 'inbreeding.f_stat', 'inbreeding.n_called',
        'inbreeding.expected_homs', 'inbreeding.observed_homs'
    ]].rename(
        {
            's': 'sample_id',
            'inbreeding.f_stat': 'inbreedingFStatX',
            'inbreeding.n_called': 'inbreedingCallsX',
            'inbreeding.expected_homs': 'inbreedingExpectedHomX',
            'inbreeding.observed_homs': 'inbreedingObservedHomX'
        },
        axis='columns'
    ).set_index('sample_id'),
    how='left'
)

# Also add y nonpar heterozygosity columns

chrY_good_variants= mt.filter_rows(
    (
        # Only keep Chry non-monomorphic variants with all filters passed
        # Used for heterozygosity sex check
        (hl.len(mt.filters) == 0) & (mt.locus.in_y_nonpar())
        & (mt.qc.AC[0] >= 1) & (mt.qc.AC[1] >= 1)
    ),
    keep=True
)
summary.append(('SexCheckY', chrY_good_variants))
print("Exporting heterozygosity/inbreeding statistics (Chr Y)")

inbreedingY=hl.agg.inbreeding(chrY_good_variants.GT, chrY_good_variants.qc.AF[1])
inbreedingY.write('inbreedingY.mt')
inbreedingY_mt=read_matrix_table('inbreedingY.mt')

sample_df = sample_df.join(
    chrY_good_variants.annotate_cols(
        inbreedingY_mt
    ).cols().to_pandas()[[
        's', 'inbreeding.f_stat', 'inbreeding.n_called',
        'inbreeding.expected_homs', 'inbreeding.observed_homs'
    ]].rename(
        {
            's': 'sample_id',
            'inbreeding.f_stat': 'inbreedingFStatY',
            'inbreeding.n_called': 'inbreedingCallsY',
            'inbreeding.expected_homs': 'inbreedingExpectedHomY',
            'inbreeding.observed_homs': 'inbreedingObservedHomY'
        },
        axis='columns'
    ).set_index('sample_id'),
    how='left'
)

path = args.prefix + '.vcf_sample_qc.tsv'
print("Writing collected metrics to", path)
with create_fh(path, 'w') as w:
    sample_df.to_csv(
        w,
        sep='\t'
    )

print("Finished generating primary outputs")
print("The summary file may take a while to finish")
print("If you are not interested in the summary file, you may cancel the job now")
output_summary(generate_summary(summary), args.prefix+"_var_qc.tsv")

Topic		Replies	Views
Code check to run WES Hail Query & hailctl	2	553	July 8, 2020
Filtering samples with extreme heterozygosity in hail? Hail Query & hailctl	7	1231	February 19, 2020
Compound heterozygote analysis? Hail Query & hailctl	3	449	May 17, 2022
Unable to do sample/variant QC after combining MatrixTable Hail Query & hailctl	11	418	January 8, 2023
Counting Rows More Quickly in VDS Hail Query & hailctl	12	528	July 17, 2023

Inbreeding Calculation Run Time

Related topics