TL;DR: I don’t know how to trigger regression fits dynamically for only a sparse pattern in my grid of variants × phenotypes.
Hi! I was trying to do a GWAS-related thing in Hail and I’m a bit stuck. It’s a bit long to spell out.
I want to do a simple simulation, looking at the extreme p-values obtained from a group of control samples and many fake binary phenotypes. So I’d like to run GWASes for many binary traits quickly, but I’m only interested in the lowest p-value from each single GWAS.
Say I have a matrix table which looks like this:
```python
import hail as hl
import numpy as np
import matplotlib.pyplot as plt

# fake genotypes
hl.set_global_seed(1)
n_variants = 2_000
mt = hl.balding_nichols_model(
    6, 200, n_variants
).select_globals(
).select_cols(
).select_rows(
).rename(
    {"sample_idx": "s"}
).cache()

# fake phenotypes (ys)
n_ys = 160
y_names = [f"y_{i + 1}" for i in range(n_ys)]
mt = mt.annotate_cols(
    **{y_name: hl.rand_bool(0.2) for y_name in y_names}
)

# fake covariates
mt = mt.annotate_cols(
    is_female=hl.rand_bool(0.5),
    age=hl.int(hl.rand_unif(18, 75))
)

# PCs
n_pcs = 3
_, pcs, _ = hl.hwe_normalized_pca(mt.GT, k=n_pcs)
mt = mt.annotate_cols(scores=pcs[mt.s].scores)
```
And now I’m interested in `min_pvals`, a distribution of p-values obtained in the following way:
```python
gwas = hl.logistic_regression_rows(
    "firth",
    y=[mt[y] for y in y_names],
    x=mt.GT.n_alt_alleles(),
    covariates=[
        1,
        mt.age,
        mt.is_female,
        *[mt.scores[i] for i in range(n_pcs)]
    ]
)
# one minimum per phenotype: agg.min applied elementwise
# over the per-variant array of p-values
min_pvals = gwas.aggregate(
    hl.agg.array_agg(
        hl.agg.min,
        gwas.logistic_regression.p_value
    )
)
min_pvals = np.sort(np.array(min_pvals))
```
Given these, I can ask about, e.g., the 5th percentile of those minimal p-values:
```python
plt.plot(-np.log10(min_pvals), 'o-')
significance_0 = -np.log10(0.05 / n_variants)
significance_1 = -np.log10(np.percentile(min_pvals, 5))
plt.plot([0, len(min_pvals)], [significance_0, significance_0], "--", label="bonferroni")
plt.plot([0, len(min_pvals)], [significance_1, significance_1], "--", label="simulated")
plt.ylabel(r"$-\log_{10}(\min($p-value$))$", fontsize=16)
plt.legend()
plt.show()
```
I’d like to do this with a larger, non-fake `mt`, more covariates, and higher values of `n_ys`. And preferably many times, over various sets of samples.
As far as I can tell, `hl.linear_regression_rows` is faster than `hl.logistic_regression_rows("wald")`, which again is faster than `hl.logistic_regression_rows("firth")` (which seems plausible: across many phenotypes linear regression reduces to a few matrix products, whereas each logistic fit is iterative, and Firth’s penalized likelihood costs more per iteration). But, given a binary phenotype, a p-value from `firth` is more desirable than one from `wald`, which again is more desirable than one from `linear_regression`.
So, I had this idea:
- Do a GWAS with linear regression (all variants × all phenotypes).
- For each phenotype, compute a lower, say 10th, percentile of the p-values.
- For each phenotype, take only those variants whose p-values are below this percentile and refit them with Wald logistic regression (roughly the per-phenotype filtering sketched after this list). Swap the p-values from linear regression for the new ones from Wald.
- For each phenotype, using only those variants where Wald was fitted, again compute the 10th percentile.
- For each phenotype, take only those variants where Wald was fitted and whose p-value is below this last percentile (or where the Wald fit did not converge), and refit them with Firth logistic regression. Swap the p-values from Wald for the new ones from Firth.
- For each phenotype, note its minimal p-value.
- Hope that this is faster but gives roughly the same result (in terms of the distribution of the obtained minimal p-values) as simply fitting Firth everywhere.
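For a single phenotype, the sparse refit I have in mind would look something like the sketch below (illustration only: the phenotype index `i` and the `gwas_lin` table, obtained from a plain `hl.linear_regression_rows` call invoked like the Firth example above, are assumptions, not something I have working end to end):

```python
# Sketch: refit only the candidate variants of phenotype i with Wald.
# gwas_lin.p_value is an array indexed by phenotype, because y was a list.
i = 0
cutoff = gwas_lin.aggregate(
    hl.agg.approx_quantiles(gwas_lin.p_value[i], 0.1)
)
candidates = gwas_lin.filter(gwas_lin.p_value[i] < cutoff)
mt_i = mt.semi_join_rows(candidates)  # keep only the candidate variants
wald_i = hl.logistic_regression_rows(
    "wald",
    y=mt_i[y_names[i]],
    x=mt_i.GT.n_alt_alleles(),
    covariates=[1, mt_i.age, mt_i.is_female,
                *[mt_i.scores[j] for j in range(n_pcs)]]
)
```

But looping this over all phenotypes in Python would presumably forfeit the single-pass parallelism that makes the all-phenotypes call fast, which is why I tried to express everything as one matrix table instead.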
I’ve tried the following implementation, but it’s actually slower than simply doing Firth everywhere. I guess it actually computes all p-values with all three methods over the full grid and only then selects which one to keep, rather than running the more expensive methods only on the selected variant-phenotype pairs.
```python
def wald_regression_rows(**kwargs):
    return hl.logistic_regression_rows(test="wald", **kwargs)

def firth_regression_rows(**kwargs):
    return hl.logistic_regression_rows(test="firth", **kwargs)

def is_array(dtype):
    # True iff the hail dtype is an array type
    return isinstance(dtype, hl.tarray)

def get_array_fields(ht):
    # Split the non-key row fields into array-typed and scalar fields.
    nonkey_fields = [
        (name, dtype)
        for name, dtype in zip(ht.row, ht.row.dtype.types)
        if name not in ht.key
    ]
    array_fields = [
        name for name, dtype in nonkey_fields if is_array(dtype)
    ]
    nonarray_fields = [
        name for name, dtype in nonkey_fields if not is_array(dtype)
    ]
    return array_fields, nonarray_fields
```
```python
def gwas_ht2mt(ht, y_names):
    # Convert a hail table returned e.g. by linear_regression_rows
    # into a matrix table with the ys as columns.
    array_fields, nonarray_fields = get_array_fields(ht)
    # one column per phenotype: (idx, name) pairs
    ht = ht.annotate(
        column=hl.enumerate(y_names).map(
            lambda t: hl.struct(idx=t[0], name=t[1])
        )
    )
    ht = ht.explode(ht.column)
    # pick this column's element out of every array field
    ht = ht.annotate(**{
        field: ht[field][ht.column.idx]
        for field in array_fields
    })
    return ht.to_matrix_table(
        row_key=list(ht.key),
        col_key=["column"],
        row_fields=nonarray_fields
    )
```
```python
def do_gwas(
    hl_func,
    mt,
    y_names=y_names,
    n_pcs=n_pcs
):
    kwargs = {
        "y": [mt[y_name] for y_name in y_names],
        "x": mt.GT.n_alt_alleles(),
        "covariates": [
            1,
            mt.age,
            mt.is_female,
            *[mt.scores[i] for i in range(n_pcs)]
        ]
    }
    gwas_ht = hl_func(**kwargs)
    return gwas_ht2mt(gwas_ht, y_names)
```
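As a quick sanity check on the reshaping (just my expectation of the shape, not part of the pipeline itself):

```python
# rows should be the variants, columns the phenotypes
gwas_lin_mt = do_gwas(hl.linear_regression_rows, mt)
assert gwas_lin_mt.count() == (n_variants, n_ys)
```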
```python
def do_gwas_mixed(
    mt,
    pval_percentile_lin=0.1,
    pval_percentile_wald=0.1
):
    # NB: all three GWASes are computed over the full grid up front
    gwas_lin = do_gwas(hl.linear_regression_rows, mt)
    gwas_wald = do_gwas(wald_regression_rows, mt)
    gwas_firth = do_gwas(firth_regression_rows, mt)
    # stash the linear results under a single entry field
    gwas = gwas_lin.annotate_entries(
        lin=gwas_lin.entry
    ).drop(*gwas_lin.entry)
    # per-phenotype percentile of the linear p-values
    gwas = gwas.annotate_cols(
        pval_percentile=hl.struct(
            lin=hl.agg.approx_quantiles(
                gwas.lin.p_value,
                pval_percentile_lin
            )
        )
    )
    # keep Wald results only below the percentile (or where linear gave NaN)
    gwas = gwas.annotate_entries(
        wald=hl.case().when(
            (gwas.lin.p_value < gwas.pval_percentile.lin) |
            hl.is_nan(gwas.lin.p_value),
            gwas_wald[gwas.row_key, gwas.col_key].logistic_regression
        ).or_missing()
    )
    # per-phenotype percentile of the Wald p-values
    gwas = gwas.annotate_cols(
        pval_percentile=gwas.pval_percentile.annotate(
            wald=hl.agg.approx_quantiles(
                gwas.wald.p_value,
                pval_percentile_wald
            )
        )
    )
    # keep Firth results only below the Wald percentile
    # (or where the Wald fit did not converge)
    gwas = gwas.annotate_entries(
        firth=hl.case().when(
            (gwas.wald.p_value < gwas.pval_percentile.wald) |
            ~gwas.wald.fit.converged,
            gwas_firth[gwas.row_key, gwas.col_key].logistic_regression
        ).or_missing()
    )
    # best available p-value: firth > wald > linear
    gwas = gwas.annotate_entries(
        p_value=hl.case()
        .when(
            hl.is_defined(gwas.firth) & gwas.firth.fit.converged,
            gwas.firth.p_value
        )
        .when(
            hl.is_defined(gwas.wald) & gwas.wald.fit.converged,
            gwas.wald.p_value
        )
        .default(
            gwas.lin.p_value
        )
    )
    return gwas
```
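For completeness, pulling the per-phenotype minima out of the mixed result would then be a column-wise aggregation over the entries (same idea as the `array_agg` above):

```python
gwas_mixed = do_gwas_mixed(mt)
min_pvals = np.sort(np.array(
    gwas_mixed.annotate_cols(
        min_pval=hl.agg.min(gwas_mixed.p_value)
    ).cols().min_pval.collect()
))
```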
Apologies for this long, ugly snippet. So, this doesn’t work as intended (it’s slower, not faster), but perhaps it explains what I’m trying to do.
Simple timings in my Jupyter notebook:

```python
%%timeit
do_gwas(hl.linear_regression_rows, mt).p_value.collect()
>>> 4.94 s ± 288 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

%%timeit
do_gwas(wald_regression_rows, mt).logistic_regression.p_value.collect()
>>> 13 s ± 1.11 s per loop (mean ± std. dev. of 7 runs, 1 loop each)

%%timeit
do_gwas(firth_regression_rows, mt).logistic_regression.p_value.collect()
>>> 38.9 s ± 248 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

%%timeit
do_gwas_mixed(mt).p_value.collect()
>>> 58.3 s ± 575 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
```
I suppose I should just use `wald`, but if anyone has any suggestions about the best way to obtain my distribution of `min_pvals`, I would love to hear them!
Thanks!