I am trying to run the following component of our Variant QC script to find variants that fall outside of HWE. We are filtering variants by MAF thresholds: 0.01 for all ancestry groups, save for the European group, for which our threshold is 0.001.
def flag_hwe(ds, pval_threshold, n_participants):
'''
Perform sex aware HWE tests on autosomes and chrX for hg38
'''
populations = ds.aggregate_cols(hl.agg.counter(ds.sa.POP)) # produces a dict of population
if None in populations:
print("WARNING: Skipping HWE test for samples of undefined population")
del populations[None]
hwe_tests = {}
# Pre-queue the 3 hwe tests for each population
for pop in populations:
#ds_subpop=hl.agg.filter(ds.sa.POP == pop)
ds_subpop=hl.variant_qc(ds.filter_cols(ds.sa.POP == pop))
if pop=='EUR':
maf_thr=0.001
elif pop=='AFR':
maf_thr=0.01
elif pop=='SAS':
maf_thr=0.01
elif pop=='EAS':
maf_thr=0.01
ds_subpop = ds_subpop.filter_rows(ds_subpop.variant_qc.AF[1] > maf_thr)
hwe_tests['hwe{}'.format(pop)] = hl.cond(
# Filter 1) hwe{Pop}
# If variant is autosomal and there are enough samples in the population, apply HWE
ds_subpop.locus.in_autosome() & (hl.agg.count_where(ds_subpop.sa.POP == pop) >= n_participants),
hl.agg.filter(ds_subpop.sa.POP == pop, hl.agg.hardy_weinberg_test(ds_subpop.GT)),
# If variant is not autosomal, but in X nonpar and there are enough females in the population, apply HWE
hl.cond(
ds.locus.in_x_nonpar() & (hl.agg.count_where((ds.sa.POP == pop) & (ds.sa.GENDER == "Female")) >= n_participants),
hl.agg.filter((ds.sa.POP == pop) & (ds.sa.GENDER == "Female"), hl.agg.hardy_weinberg_test(ds.GT)),
hl.struct(het_freq_hwe=-1, p_value=-1) # Fallback: Not enough samples for this population
)
)
hwe_tests['hwe{}FemalePAR'.format(pop)] = hl.cond(
# Filter 2) hwe{Pop}Female
# If this is an X par variant and there are enough females in the population, apply HWE
ds.locus.in_x_par() & (hl.agg.count_where((ds.sa.POP == pop) & (ds.sa.GENDER == "Female")) >= n_participants),
hl.agg.filter((ds.sa.POP == pop) & (ds.sa.GENDER == "Female"), hl.agg.hardy_weinberg_test(ds.GT)),
hl.struct(het_freq_hwe=-1, p_value=-1) # Fallback: Not enough samples for this population
)
hwe_tests['hwe{}MalePAR'.format(pop)] = hl.cond(
# Filter 2) hwe{Pop}Male
# If this is an X par variant and there are enough males in the population, apply HWE
ds.locus.in_x_par() & (hl.agg.count_where((ds.sa.POP == pop) & (ds.sa.GENDER == "Male")) >= n_participants),
hl.agg.filter((ds.sa.POP == pop) & (ds.sa.GENDER == "Male"), hl.agg.hardy_weinberg_test(ds.GT)),
hl.struct(het_freq_hwe=-1, p_value=-1) # Fallback: Not enough samples for this population
)
hwe_tests['hwe{}FemalenonPAR'.format(pop)] = hl.cond(
# Filter 3) hwe{Pop}nonPAR
# If this is an X par variant and there are enough subjects in the population, apply HWE
ds.locus.in_x_nonpar() & (hl.agg.count_where((ds.sa.POP == pop) & (ds.sa.GENDER == "Female")) >= n_participants),
hl.agg.filter((ds.sa.POP == pop), hl.agg.hardy_weinberg_test(ds.GT)),
hl.struct(het_freq_hwe=-1, p_value=-1) # Fallback: Not enough samples for this population
)
hwe_tests['hwe{}MalenonPAR'.format(pop)] = hl.cond(
# Filter 3) hwe{Pop}nonPAR
# If this is an X par variant and there are enough subjects in the population, apply HWE
ds.locus.in_x_nonpar() & (hl.agg.count_where((ds.sa.POP == pop) & (ds.sa.GENDER == "Male")) >= n_participants),
hl.agg.filter((ds.sa.POP == pop), hl.agg.hardy_weinberg_test(ds.GT)),
hl.struct(het_freq_hwe=-1, p_value=-1) # Fallback: Not enough samples for this population
)
# execute tests
ds = ds.annotate_rows(
hweByPop=hl.struct(
**hwe_tests
)
)
hwe_info = {}
# Pre-queue the 3 info annotations for each population
# This exports variant.hwe{}.p_value to an info field variant.info.pHWE{}
for pop in populations:
hwe_info['pHWE{}'.format(pop)] = ds.hweByPop['hwe{}'.format(pop)].p_value
hwe_info['pHWE{}FemalePAR'.format(pop)] = ds.hweByPop['hwe{}FemalePAR'.format(pop)].p_value
hwe_info['pHWE{}MalePAR'.format(pop)] = ds.hweByPop['hwe{}MalePAR'.format(pop)].p_value
hwe_info['pHWE{}FemalenonPAR'.format(pop)] = ds.hweByPop['hwe{}FemalenonPAR'.format(pop)].p_value
hwe_info['pHWE{}MalenonPAR'.format(pop)] = ds.hweByPop['hwe{}MalenonPAR'.format(pop)].p_value
# execute annotations
ds = ds.annotate_rows(
info=ds.info.annotate(
**hwe_info
)
)
# Add filter flags
for pop in populations:
# Now flag variants based on the failed populations
ds = ds.annotate_rows(
filters=hl.cond(
(ds.info['pHWE{}'.format(pop)] != -1) & (ds.info['pHWE{}'.format(pop)] < pval_threshold),
ds.filters.add('HWE{}'.format(pop)),
ds.filters
)
)
ds = ds.annotate_rows(
filters=hl.cond(
(ds.info['pHWE{}FemalePAR'.format(pop)] != -1) & (ds.info['pHWE{}FemalePAR'.format(pop)] < pval_threshold),
ds.filters.add('HWE{}FemalePAR'.format(pop)),
ds.filters
)
)
ds = ds.annotate_rows(
filters=hl.cond(
(ds.info['pHWE{}MalePAR'.format(pop)] != -1) & (ds.info['pHWE{}MalePAR'.format(pop)] < pval_threshold),
ds.filters.add('HWE{}MalePAR'.format(pop)),
ds.filters
)
)
ds = ds.annotate_rows(
filters=hl.cond(
(ds.info['pHWE{}FemalenonPAR'.format(pop)] != -1) & (ds.info['pHWE{}FemalenonPAR'.format(pop)] < pval_threshold),
ds.filters.add('HWE{}FemalenonPAR'.format(pop)),
ds.filters
)
)
ds = ds.annotate_rows(
filters=hl.cond(
(ds.info['pHWE{}MalenonPAR'.format(pop)] != -1) & (ds.info['pHWE{}MalenonPAR'.format(pop)] < pval_threshold),
ds.filters.add('HWE{}MalenonPAR'.format(pop)),
ds.filters
)
)
return ds
I have encountered the following error which I am hoping to better understand. Does this error result from referencing of two different MatrixTables: ds_subpop and ds? Thank you in advance.
Traceback (most recent call last):
File "hail_variant_qc_v11_run3_YL.py", line 550, in <module>
vcf = flag_hwe(vcf, args.pval_threshold, args.n_individuals)
File "hail_variant_qc_v11_run3_YL.py", line 142, in flag_hwe
hwe_tests['hwe{}'.format(pop)] = hl.cond(
File "/gpfs/fs1/home/mg1646/.conda/envs/variantqc_hailenv/lib/python3.8/site-packages/deprecated/classic.py", line 285, in wrapper_function
return wrapped_(*args_, **kwargs_)
File "<decorator-gen-736>", line 2, in cond
File "/gpfs/fs1/home/mg1646/.conda/envs/variantqc_hailenv/lib/python3.8/site-packages/hail/typecheck/check.py", line 584, in wrapper
return __original_func(*args_, **kwargs_)
File "/gpfs/fs1/home/mg1646/.conda/envs/variantqc_hailenv/lib/python3.8/site-packages/hail/expr/functions.py", line 351, in cond
return if_else(condition, consequent, alternate, missing_false)
File "<decorator-gen-738>", line 2, in if_else
File "/gpfs/fs1/home/mg1646/.conda/envs/variantqc_hailenv/lib/python3.8/site-packages/hail/typecheck/check.py", line 584, in wrapper
return __original_func(*args_, **kwargs_)
File "/gpfs/fs1/home/mg1646/.conda/envs/variantqc_hailenv/lib/python3.8/site-packages/hail/expr/functions.py", line 406, in if_else
indices, aggregations = unify_all(condition, consequent, alternate)
File "/gpfs/fs1/home/mg1646/.conda/envs/variantqc_hailenv/lib/python3.8/site-packages/hail/expr/expressions/base_expression.py", line 417, in unify_all
raise ExpressionException(
hail.expr.expressions.base_expression.ExpressionException: Cannot combine expressions from different source objects.
Found fields from 2 objects:
<hail.matrixtable.MatrixTable object at 0x2b7371b28670>: ['locus', 'sa', 'sa', 'GT']
<hail.matrixtable.MatrixTable object at 0x2b73719a8490>: ['locus', 'sa', 'GT']