Yes! It’s long and a bit convoluted, but here you go:
# set up hail with appropriate reference chromosome and sparkConf
import hail as hl
import pandas as pd
import datetime as d
hl.init(default_reference='GRCh37',spark_conf={'spark.driver.memory': '250G'})
# get useful things for later
from hail.plot import show
from pprint import pprint
hl.plot.output_notebook()
# import chr16 bgen as a MatrixTable
ef = ['GT', 'GP','dosage']
hl.index_bgen("ukb_imp_chr16_v3.bgen",contig_recoding={"01": "1"})
chr16 = hl.import_bgen("ukb_imp_chr16_v3.bgen", entry_fields=ef,sample_file="ukb52480_imp_chr16_v3_s487314.sample")
chr16.describe()
# import phenotype
phenos = pd.read_pickle("./phenos.pkl")
# then convert to Hail table object
phenos.columns = ["eid","BMI"]
ph = hl.Table.from_pandas(phenos,key=["eid"])
# remove individuals with NaN BMI values
ph_filtered = ph.filter(hl.is_nan(ph.BMI),keep=False)
pprint(ph.count()-ph_filtered.count())
# join phenotype table onto matrixtable
chr16 = chr16.annotate_cols(phenotype = ph_filtered[chr16.s])
# get biological sex and ethnicity from uk biobank data and add it to table
bios = pd.read_csv("ukb37493(2).csv",
usecols = ["eid","22001-0.0","22006-0.0"],
dtype={"eid":str,"22001-0.0":float,"22006-0.0":float})
# then convert to Hail table object
bios.columns = ["eid","sex","ethnic_group"]
bs = hl.Table.from_pandas(bios,key=["eid"])
# remove individuals with NaN values
bs = bs.filter(hl.is_nan(bs.sex),keep=False)
bs = bs.filter(hl.is_nan(bs.ethnic_group),keep=False)
# join to matrixtable
chr16 = chr16.annotate_cols(ukbb = bs[chr16.s])
chr16.describe()
# exclude non-european individuals
chr16_qc1 = chr16.filter_cols(chr16.ukbb.ethnic_group == 1)
print('Samples: %d ' % (chr16_qc1.count_cols()))
# exclude individuals not used in Biobank PCA calculation
p = pd.read_csv("ukb37493(2).csv",
usecols = ["eid","22020-0.0"],
dtype={"eid":str,"22020-0.0":float})
p.columns = ["eid","pca_inclusions"]
pca_inclusions = hl.Table.from_pandas(p,key=["eid"])
pca_inclusions = pca_inclusions.filter(hl.is_nan(pca_inclusions.pca_inclusions),keep=False)
to_keep = pca_inclusions.eid.collect()
set_to_keep = hl.literal(to_keep)
print('%d individuals used in PCA calc' % hl.eval(hl.len(set_to_keep)))
chr16_qc2 = chr16_qc1.filter_cols(set_to_keep.contains(chr16_qc1['s']))
print('Samples removed: %d' % hl.eval(chr16_qc1.count_cols()-chr16_qc2.count_cols()))
# exclude biological and reported sex mismatches and import age for covariates
# read in reported sex and YOB, convert YOB to age, annotate matrixTable
p = pd.read_csv("ukb40744.csv", usecols = ["eid","31-0.0","34-0.0"],
dtype={"eid":str,"31-0.0":float,"34-0.0":float})
p.columns = ["eid","reported_sex","age"]
p.age = int(d.datetime.now().year) - p.age
pt = hl.Table.from_pandas(p, key=["eid"])
chr16_qc2 = chr16_qc2.annotate_cols(ukbb2 = pt[chr16_qc2.s])
#chr16_qc2.describe()
#chr16_qc2.ukbb2.show(2)
# filter for mismatches
chr16_qc2a = chr16_qc2.filter_cols(chr16_qc2.ukbb.sex == chr16_qc2.ukbb2.reported_sex)
chr16_qc2a.describe()
print('Samples removed: %d' % hl.eval(chr16_qc2.count_cols()-chr16_qc2a.count_cols()))
# import lists of recommended exclusions and weird cases, convert to tables, filter for NaNs
# recommended heterozygosity exclusions
he = pd.read_csv("ukb37493(2).csv",
usecols = ["eid","22010-0.0"],
dtype={"eid":str,"22010-0.0":float})
he.columns = ["eid","het_exclusions"]
het_exclusions = hl.Table.from_pandas(he,key=["eid"])
het_exclusions = het_exclusions.filter(hl.is_nan(het_exclusions.het_exclusions),keep=False)
# recommended relatedness exclusions
re = pd.read_csv("ukb37493(2).csv",
usecols = ["eid","22018-0.0"],
dtype={"eid":str,"22018-0.0":float})
re.columns = ["eid","rel_exclusions"]
rel_exclusions = hl.Table.from_pandas(re,key=["eid"])
rel_exclusions = rel_exclusions.filter(hl.is_nan(rel_exclusions.rel_exclusions),keep=False)
# aneuploidy cases
an = pd.read_csv("ukb37493(2).csv",
usecols = ["eid","22019-0.0"],
dtype={"eid":str,"22019-0.0":float})
an.columns = ["eid","aneuploidy"]
aneuploidy = hl.Table.from_pandas(an,key=["eid"])
aneuploidy = aneuploidy.filter(hl.is_nan(aneuploidy.aneuploidy),keep=False)
# remove samples from table if they appear in recommended exclusion lists
# heterozygosity exclusions
to_remove = het_exclusions.eid.collect()
set_to_remove = hl.literal(to_remove)
print('to remove: %d' % hl.eval(hl.len(set_to_remove)))
chr16_qc3 = chr16_qc2.filter_cols(~set_to_remove.contains(chr16_qc2['s']))
print('Samples removed: %d' % hl.eval(chr16_qc2.count_cols()-chr16_qc3.count_cols()))
# relatedness exclusions
to_remove = rel_exclusions.eid.collect()
set_to_remove = hl.literal(to_remove)
print('to remove: %d' % hl.eval(hl.len(set_to_remove)))
chr16_qc4 = chr16_qc3.filter_cols(~set_to_remove.contains(chr16_qc3['s']))
print('Samples removed: %d' % hl.eval(chr16_qc3.count_cols()-chr16_qc4.count_cols()))
# aneuploidy
to_remove = aneuploidy.eid.collect()
set_to_remove = hl.literal(to_remove)
print('to remove: %d' % hl.eval(hl.len(set_to_remove)))
chr16_qc5 = chr16_qc4.filter_cols(~set_to_remove.contains(chr16_qc4['s']))
print('Samples removed: %d' % hl.eval(chr16_qc4.count_cols()-chr16_qc5.count_cols()))
print('Total samples removed: %d' % hl.eval(chr16_qc1.count_cols()-chr16_qc5.count_cols()))
# calculate per-variant qc stats
chr16_qc5 = hl.variant_qc(chr16_qc5)
print('Total Variants: %d' % (chr16_qc5.count_rows()))
# remove non-SNPs
chr16_qc6 = hl.filter_alleles(chr16_qc5, lambda allele, i: hl.is_snp(chr16_qc5.alleles[0], allele))
chr16_qc6.describe()
print('Variants remaining: %d' % (chr16_qc6.count_rows()))
# HWE filter (Neale lab value)
chr16_qc7 = chr16_qc6.filter_rows(chr16_qc6.variant_qc.p_value_hwe > 1e-10)
print('Variants remaining: %d' % (chr16_qc7.count_rows()))
# info score filter -- using 0.4 as indicated by exploratory plot
# import and attach info score and MAF
infos = pd.read_csv("ukb_mfi_chr16_v3.txt", sep="\t", header=None)
ins = infos[[0,5,6,7]]
ins.columns = ["varid","MAF","minor_allele","info_score"]
i = hl.Table.from_pandas(ins,key=["varid"])
# join info score table onto matrixtable
chr16_qc7 = chr16_qc7.annotate_rows(SNP_info = i[chr16_qc7.varid])
chr16_qc8 = chr16_qc7.filter_rows(chr16_qc7.SNP_info.info_score > 0.4)
print('Variants remaining: %d' % (chr16_qc8.count_rows()))
# MAF filter
chr16_qc9 = chr16_qc8.filter_rows(chr16_qc8.SNP_info.MAF > 0.001)
print('Variants remaining: %d' % (chr16_qc9.count_rows()))
# import principal components calculated by uk biobank
p = pd.read_csv("ukb37493(2).csv",
usecols = ["eid","22009-0.1","22009-0.2","22009-0.3",
"22009-0.4","22009-0.5","22009-0.6",
"22009-0.7","22009-0.8","22009-0.9","22009-0.10"],
dtype={"eid":str,"22009-0.1":float,"22009-0.2":float,"22009-0.3":float,
"22009-0.4":float,"22009-0.5":float,"22009-0.6":float,
"22009-0.7":float,"22009-0.8":float,"22009-0.9":float,"22009-0.10":float})
p.columns = ["eid","pca1","pca2","pca3","pca4","pca5","pca6","pca7","pca8","pca9","pca10"]
pca = hl.Table.from_pandas(p,key=["eid"])
# join to matrixtable
chr16_qc9 = chr16_qc9.annotate_cols(PCA = pca[chr16_qc9.s])
chr16_qc9.describe()
# GWAS with dosages and confounders as covariates
gwas = hl.linear_regression_rows(y=chr16_qc9.phenotype.BMI,
x=hl.gp_dosage(chr16_qc9.GP),
covariates=[1.0, chr16_qc9.ukbb2.reported_sex,
chr16_qc9.ukbb2.age,
chr16_qc9.PCA.pca1, chr16_qc9.PCA.pca2,
chr16_qc9.PCA.pca3,chr16_qc9.PCA.pca4,
chr16_qc9.PCA.pca5, chr16_qc9.PCA.pca6,
chr16_qc9.PCA.pca7, chr16_qc9.PCA.pca8,
chr16_qc9.PCA.pca9,chr16_qc9.PCA.pca10])
gwas.row.describe()