Code check to run WES

Hi!
I am running a whole exome sequencing analysis using logistic Firth test and generating several qc metric.
I wonder my codes make sense.
The output result seems to make sense to me, but it would be nice to make sure my understanding and codes are correct.
Can anyone have a look on the following codes and provide feed back?
Thank you so much!
Sean

import hail as hl
hl.init()
from pprint import pprint
from bokeh.io import output_notebook, show
from pprint import pprint
output_notebook()

#Read in data matrix
mt=hl.read_matrix_table('test.mt')

#Filter genotypes by GQ>=20
mtGQ20=mt.filter_entries(mt.GQ>=20)

#Perform variant QC
mtGQ20=hl.variant_qc(mtGQ20)

#Calculate callrate for cases and controls separately
mtGQ20=mtGQ20.annotate_rows(
    callrate_U=hl.agg.filter(mtGQ20.pheno.T2D==1, hl.agg.count_where(hl.is_defined(mtGQ20.GT))/mtGQ20.aggregate_cols(hl.agg.count_where(mtGQ20.pheno.T2D==1))),
    callrate_A=hl.agg.filter(mtGQ20.pheno.T2D==2, hl.agg.count_where(hl.is_defined(mtGQ20.GT))/mtGQ20.aggregate_cols(hl.agg.count_where(mtGQ20.pheno.T2D==2))))

#Get number of samples for cases and controls
mtGQ20=mtGQ20.annotate_rows(
    n_case=hl.agg.count_where(mtGQ20.pheno.T2D==2),
    n_cont=hl.agg.count_where(mtGQ20.pheno.T2D==1)
)

#Prepare number of missing/non-missing by case/control for missingness test
mtGQ20=mtGQ20.annotate_rows(
    nonT2DnonMiss=hl.agg.filter(mtGQ20.pheno.T2D==1, hl.agg.count_where(hl.is_defined(mtGQ20.GT))),
    nonT2DMiss=hl.agg.filter(mtGQ20.pheno.T2D==1, mtGQ20.aggregate_cols(hl.agg.count_where(mtGQ20.pheno.T2D==1))-hl.agg.count_where(hl.is_defined(mtGQ20.GT))),
    T2DnonMiss=hl.agg.filter(mtGQ20.pheno.T2D==2, hl.agg.count_where(hl.is_defined(mtGQ20.GT))),
    T2DMiss=hl.agg.filter(mtGQ20.pheno.T2D==2, mtGQ20.aggregate_cols(hl.agg.count_where(mtGQ20.pheno.T2D==2))-hl.agg.count_where(hl.is_defined(mtGQ20.GT))))

#Perform Fisher exact test for case/control missingness using above
mtGQ20=mtGQ20.annotate_rows(
    TMISS=hl.fisher_exact_test(
        hl.int32(mtGQ20.nonT2DnonMiss), hl.int32(mtGQ20.nonT2DMiss), hl.int32(mtGQ20.T2DnonMiss), hl.int32(mtGQ20.T2DMiss)
    )
)

#Perform HWE test for cases and controls separately
mtGQ20=mtGQ20.annotate_rows(
    HWE_U=hl.agg.filter(mtGQ20.pheno.T2D==1, hl.hardy_weinberg_test(
        hl.int32(hl.agg.count_where(mtGQ20.GT.is_hom_ref())),
        hl.int32(hl.agg.count_where(mtGQ20.GT.is_het_ref())),
        hl.int32(hl.agg.count_where(mtGQ20.GT.is_hom_var())))),
    HWE_A=hl.agg.filter(mtGQ20.pheno.T2D==2, hl.hardy_weinberg_test(
        hl.int32(hl.agg.count_where(mtGQ20.GT.is_hom_ref())),
        hl.int32(hl.agg.count_where(mtGQ20.GT.is_het_ref())),
        hl.int32(hl.agg.count_where(mtGQ20.GT.is_hom_var())))))

#Perform logistic Firth test
logistf_GQ20=hl.logistic_regression_rows(test='firth', y=mtGQ20.pheno.t2d, x=mtGQ20.GT.n_alt_alleles(), covariates=[1.0])

#Annotate Firth result
mtGQ20=mtGQ20.annotate_rows(
    firth_beta=logistf_GQ20[mtGQ20.row_key].beta,
    firth_chisq=logistf_GQ20[mtGQ20.row_key].chi_sq_stat,
    firth_p=logistf_GQ20[mtGQ20.row_key].p_value,
)

#Calculate call stats by case/control status to extract AC, AF by cases and controls
mtGQ20=mtGQ20.annotate_rows(
    callstats_U=hl.agg.filter(mtGQ20.pheno.T2D==1, hl.agg.call_stats(mtGQ20.GT, mtGQ20.alleles)),
    callstats_A=hl.agg.filter(mtGQ20.pheno.T2D==2, hl.agg.call_stats(mtGQ20.GT, mtGQ20.alleles)))

#gnomAD WES RF filter
ge = hl.read_table('gs://gnomad-public/release/2.1.1/ht/exomes/gnomad.exomes.r2.1.1.sites.ht')
mtGQ20=mtGQ20.annotate_rows(gnomadExomes=ge[mtGQ20.row_key])
mtGQ20=mtGQ20.annotate_rows(gEfilter=(mtGQ20.gnomadExomes.filters.length()>0) | mtGQ20.gnomadExomes.filters.contains("RF"))

#gnomAD WGS RF filter
gg = hl.read_table('gs://gnomad-public/release/2.1.1/ht/genomes/gnomad.genomes.r2.1.1.sites.ht')
mtGQ20=mtGQ20.annotate_rows(gnomadGenomes=gg[mtGQ20.row_key])
mtGQ20=mtGQ20.annotate_rows(gGfilter=(mtGQ20.gnomadGenomes.filters.length()>0) | mtGQ20.gnomadGenomes.filters.contains("RF"))

#Perform logistic Wald test
wald_GQ20=hl.logistic_regression_rows(test='wald', y=mtGQ20.pheno.t2d, x=mtGQ20.GT.n_alt_alleles(), covariates=[1.0])
mtGQ20=mtGQ20.annotate_rows(wald_p=wald_GQ20[mtGQ20.row_key].p_value)

#Select columns to output
variant_annotation=mtGQ20.rows()
variant_annotation_select=variant_annotation.select(
    Rsid=variant_annotation.rsid,
    NCASE=variant_annotation.n_case,
    NCONT=variant_annotation.n_cont,
    AC_REF=variant_annotation.variant_qc.AC[0],
    AC_ALT=(variant_annotation.variant_qc.AN - variant_annotation.variant_qc.AC[0]),
    AF_REF=variant_annotation.variant_qc.AF[0],
    AF_ALT=(1-variant_annotation.variant_qc.AF[0]),
    AAF_CASE=(1-variant_annotation.callstats_A.AF[0]),
    AAF_CONT=(1-variant_annotation.callstats_U.AF[0]),
    CallRate=variant_annotation.variant_qc.call_rate,
    CallRate_U=variant_annotation.callrate_U,
    CallRate_A=variant_annotation.callrate_A,
    P_miss=variant_annotation.TMISS.p_value,
    HWE=variant_annotation.variant_qc.p_value_hwe,
    HWE_U=variant_annotation.HWE_U.p_value,
    HWE_A=variant_annotation.HWE_A.p_value,
    DP_mean=variant_annotation.variant_qc.dp_stats.mean,
    GQ_mean=variant_annotation.variant_qc.gq_stats.mean,
    GnomAD_Exome=variant_annotation.gEfilter,
    GnomAD_Genome=variant_annotation.gGfilter,
    Firth_Beta=variant_annotation.firth_beta,
    Firth_Chisq=variant_annotation.firth_chisq,
    Firth_P=variant_annotation.firth_p,
    Wald_P=variant_annotation.wald_p
)

#Write output
variant_annotation_select.export('output.tsv')

This all looks correct to me, but there are a couple of changes you might make to improve performance / clarity:


Define the number of cases and controls at the top, instead of computing them multiple times with aggregate_cols:

(number_U, number_A) = mtGQ20.aggregate_cols(
    (hl.agg.count_where(mtGQ20.pheno.T2D==1),
     hl.agg.count_where(mtGQ20.pheno.T2D==2)))

Then use number_U and number_A as denominators in computing call rate and nonT2DMiss/T2DMiss.


I’m pretty sure that the | mtGQ20.gnomadExomes.filters.contains("RF") here is unnecessary – if the filters contains “RF”, then the filters must have at least one element. You want to throw out anything that wasn’t PASS in gnomad, right?

Thank you so much!
This is wonderful.