How to run GWAS from UK Biobank efficiently on Hail

Thanks a lot, @danking! The whole time for running chr21 is about 12 minutes now.

Below is my current scripts:

import hail as hl
import pandas as pd
import os
import sys
chr = sys.argv[1]
pwd = ‘/test/imp_data/’
os.chdir(pwd)
data_path=‘/data/raw/UKB-41476/’
bgenFile = data_path+‘imp_data/ukb_imp_chr’+chr+‘_v3.bgen’
sampleFile = data_path+‘imp_data/ukb53562_imp_chr’+chr+‘_v3_s487296.sample’
mfiFile = data_path+‘imp_data/ukb_mfi_chr’+chr+‘_v3.txt’

initiate hail environment

hl.init(min_block_size=128)

import phenotype info

sample_info = hl.import_table(
pwd+‘/target_pheno_1190.csv’,
key=‘f_eid’,
types={
‘f_eid’:hl.tstr, ‘female’:hl.tbool, ‘age’:hl.tfloat64,
‘f_1190_0_0’:hl.tfloat64, ‘shift_worker’:hl.tbool,
‘PC1’:hl.tfloat64, ‘PC2’:hl.tfloat64,
‘PC3’:hl.tfloat64, ‘PC4’:hl.tfloat64
},
delimiter=‘,’
)
mfi_info = hl.import_table(
pwd+‘/mfi_info_chr’+chr+‘.tsv’,
types={
‘locus’:hl.tlocus(reference_genome=‘GRCh37’),
‘bp’:hl.tstr, ‘INFO’:hl.tfloat64
},
delimiter=‘\t’
)

mfi_info = mfi_info.key_by(
locus = mfi_info.locus,
alleles = mfi_info.alleles.split(‘,’)
)

import genotype information

genotype_info = hl.import_bgen(
path=bgenFile,
entry_fields=[‘GT’],
sample_file=sampleFile
)

add mfi information to variants

data=genotype_info.annotate_rows(mfi=mfi_info[genotype_info.row_key])
data=data.filter_rows(data.mfi.INFO>0.8)

varinat qc to get allele frequency

data = hl.variant_qc(data)

data = data.filter_rows(
(data.variant_qc.AF[1]>=0.001)
&(data.variant_qc.p_value_hwe>1e-10) # filter by HWE
)

add phenotype info to matrixTable columns

data=data.annotate_cols(pheno=sample_info[data.col.s])

filter out NA samples

data=data.filter_cols(~hl.is_nan(data.pheno.f_1190_0_0))

run association anaysis using linear regression

gwas = hl.linear_regression_rows(
y=data.pheno.f_1190_0_0,
x=data.GT.n_alt_alleles(),
covariates=[1.0,
data.pheno.female, # a logistic variable
data.pheno.shift_worker, # a logistic variable
data.pheno.age,
data.pheno.PC1,
data.pheno.PC2,
data.pheno.PC3,
data.pheno.PC4
],
pass_through=[data.variant_qc.AF,data.mfi.bp]
)

add allele freqency and rsid to gwas result

gwas = gwas.annotate(chr=hl.str(chr))

gwas = gwas.select(
‘n’, ‘beta’,
‘standard_error’,
‘p_value’, ‘chr’,
bp=hl.str(gwas.bp),
oa=gwas.alleles[0],
ea=gwas.alleles[1],
eaf=gwas.AF[1]
)

gwas.write(‘out_path/nap_chr’+chr+‘_gwas.ht’)
t = hl.read_table(‘out_path/nap_chr’+chr+‘_gwas.ht’)
t = t.key_by()
t = t.annotate(variant = hl.variant_str(t.locus, t.alleles))

t = t.select(
‘variant’,‘chr’,‘bp’,‘oa’,
‘ea’,‘eaf’,‘n’, ‘beta’,
‘standard_error’,‘p_value’
)

t.export(‘out_path/nap_chr’+chr+‘_gwas.tsv.bgz’, header=True, delimiter=‘\t’)
hl.stop()

Do you think it can be further improved? The current running time for a gwas is about 18 hours,which is slightly acceptable.