Hi!
I am running a whole exome sequencing analysis using logistic Firth test and generating several qc metric.
I wonder my codes make sense.
The output result seems to make sense to me, but it would be nice to make sure my understanding and codes are correct.
Can anyone have a look on the following codes and provide feed back?
Thank you so much!
Sean
import hail as hl
hl.init()
from pprint import pprint
from bokeh.io import output_notebook, show
from pprint import pprint
output_notebook()
#Read in data matrix
mt=hl.read_matrix_table('test.mt')
#Filter genotypes by GQ>=20
mtGQ20=mt.filter_entries(mt.GQ>=20)
#Perform variant QC
mtGQ20=hl.variant_qc(mtGQ20)
#Calculate callrate for cases and controls separately
mtGQ20=mtGQ20.annotate_rows(
callrate_U=hl.agg.filter(mtGQ20.pheno.T2D==1, hl.agg.count_where(hl.is_defined(mtGQ20.GT))/mtGQ20.aggregate_cols(hl.agg.count_where(mtGQ20.pheno.T2D==1))),
callrate_A=hl.agg.filter(mtGQ20.pheno.T2D==2, hl.agg.count_where(hl.is_defined(mtGQ20.GT))/mtGQ20.aggregate_cols(hl.agg.count_where(mtGQ20.pheno.T2D==2))))
#Get number of samples for cases and controls
mtGQ20=mtGQ20.annotate_rows(
n_case=hl.agg.count_where(mtGQ20.pheno.T2D==2),
n_cont=hl.agg.count_where(mtGQ20.pheno.T2D==1)
)
#Prepare number of missing/non-missing by case/control for missingness test
mtGQ20=mtGQ20.annotate_rows(
nonT2DnonMiss=hl.agg.filter(mtGQ20.pheno.T2D==1, hl.agg.count_where(hl.is_defined(mtGQ20.GT))),
nonT2DMiss=hl.agg.filter(mtGQ20.pheno.T2D==1, mtGQ20.aggregate_cols(hl.agg.count_where(mtGQ20.pheno.T2D==1))-hl.agg.count_where(hl.is_defined(mtGQ20.GT))),
T2DnonMiss=hl.agg.filter(mtGQ20.pheno.T2D==2, hl.agg.count_where(hl.is_defined(mtGQ20.GT))),
T2DMiss=hl.agg.filter(mtGQ20.pheno.T2D==2, mtGQ20.aggregate_cols(hl.agg.count_where(mtGQ20.pheno.T2D==2))-hl.agg.count_where(hl.is_defined(mtGQ20.GT))))
#Perform Fisher exact test for case/control missingness using above
mtGQ20=mtGQ20.annotate_rows(
TMISS=hl.fisher_exact_test(
hl.int32(mtGQ20.nonT2DnonMiss), hl.int32(mtGQ20.nonT2DMiss), hl.int32(mtGQ20.T2DnonMiss), hl.int32(mtGQ20.T2DMiss)
)
)
#Perform HWE test for cases and controls separately
mtGQ20=mtGQ20.annotate_rows(
HWE_U=hl.agg.filter(mtGQ20.pheno.T2D==1, hl.hardy_weinberg_test(
hl.int32(hl.agg.count_where(mtGQ20.GT.is_hom_ref())),
hl.int32(hl.agg.count_where(mtGQ20.GT.is_het_ref())),
hl.int32(hl.agg.count_where(mtGQ20.GT.is_hom_var())))),
HWE_A=hl.agg.filter(mtGQ20.pheno.T2D==2, hl.hardy_weinberg_test(
hl.int32(hl.agg.count_where(mtGQ20.GT.is_hom_ref())),
hl.int32(hl.agg.count_where(mtGQ20.GT.is_het_ref())),
hl.int32(hl.agg.count_where(mtGQ20.GT.is_hom_var())))))
#Perform logistic Firth test
logistf_GQ20=hl.logistic_regression_rows(test='firth', y=mtGQ20.pheno.t2d, x=mtGQ20.GT.n_alt_alleles(), covariates=[1.0])
#Annotate Firth result
mtGQ20=mtGQ20.annotate_rows(
firth_beta=logistf_GQ20[mtGQ20.row_key].beta,
firth_chisq=logistf_GQ20[mtGQ20.row_key].chi_sq_stat,
firth_p=logistf_GQ20[mtGQ20.row_key].p_value,
)
#Calculate call stats by case/control status to extract AC, AF by cases and controls
mtGQ20=mtGQ20.annotate_rows(
callstats_U=hl.agg.filter(mtGQ20.pheno.T2D==1, hl.agg.call_stats(mtGQ20.GT, mtGQ20.alleles)),
callstats_A=hl.agg.filter(mtGQ20.pheno.T2D==2, hl.agg.call_stats(mtGQ20.GT, mtGQ20.alleles)))
#gnomAD WES RF filter
ge = hl.read_table('gs://gnomad-public/release/2.1.1/ht/exomes/gnomad.exomes.r2.1.1.sites.ht')
mtGQ20=mtGQ20.annotate_rows(gnomadExomes=ge[mtGQ20.row_key])
mtGQ20=mtGQ20.annotate_rows(gEfilter=(mtGQ20.gnomadExomes.filters.length()>0) | mtGQ20.gnomadExomes.filters.contains("RF"))
#gnomAD WGS RF filter
gg = hl.read_table('gs://gnomad-public/release/2.1.1/ht/genomes/gnomad.genomes.r2.1.1.sites.ht')
mtGQ20=mtGQ20.annotate_rows(gnomadGenomes=gg[mtGQ20.row_key])
mtGQ20=mtGQ20.annotate_rows(gGfilter=(mtGQ20.gnomadGenomes.filters.length()>0) | mtGQ20.gnomadGenomes.filters.contains("RF"))
#Perform logistic Wald test
wald_GQ20=hl.logistic_regression_rows(test='wald', y=mtGQ20.pheno.t2d, x=mtGQ20.GT.n_alt_alleles(), covariates=[1.0])
mtGQ20=mtGQ20.annotate_rows(wald_p=wald_GQ20[mtGQ20.row_key].p_value)
#Select columns to output
variant_annotation=mtGQ20.rows()
variant_annotation_select=variant_annotation.select(
Rsid=variant_annotation.rsid,
NCASE=variant_annotation.n_case,
NCONT=variant_annotation.n_cont,
AC_REF=variant_annotation.variant_qc.AC[0],
AC_ALT=(variant_annotation.variant_qc.AN - variant_annotation.variant_qc.AC[0]),
AF_REF=variant_annotation.variant_qc.AF[0],
AF_ALT=(1-variant_annotation.variant_qc.AF[0]),
AAF_CASE=(1-variant_annotation.callstats_A.AF[0]),
AAF_CONT=(1-variant_annotation.callstats_U.AF[0]),
CallRate=variant_annotation.variant_qc.call_rate,
CallRate_U=variant_annotation.callrate_U,
CallRate_A=variant_annotation.callrate_A,
P_miss=variant_annotation.TMISS.p_value,
HWE=variant_annotation.variant_qc.p_value_hwe,
HWE_U=variant_annotation.HWE_U.p_value,
HWE_A=variant_annotation.HWE_A.p_value,
DP_mean=variant_annotation.variant_qc.dp_stats.mean,
GQ_mean=variant_annotation.variant_qc.gq_stats.mean,
GnomAD_Exome=variant_annotation.gEfilter,
GnomAD_Genome=variant_annotation.gGfilter,
Firth_Beta=variant_annotation.firth_beta,
Firth_Chisq=variant_annotation.firth_chisq,
Firth_P=variant_annotation.firth_p,
Wald_P=variant_annotation.wald_p
)
#Write output
variant_annotation_select.export('output.tsv')