Yes, and thank you for your help! This is the full script, including pieces unrelated to Hail.
print("Running gwas. using HAIL")
print("importing libraries")
import hail as hl
hl.init()
import pandas as pd
#import matplotlib as mpl
import numpy as np
# mpl.use('Agg')
# from matplotlib import pyplot as plt
import scipy.stats as ss
import sys
import os
acceptableSampleSizes = ["all","halfGwas1","halfGwas2","ckbN_halfGwas1","ckbN_halfGwas2","Q1_CKB-N-.5unrel","Q2_CKB-N-.5unrel","Q3_CKB-N-.5unrel","Q4_CKB-N-.5unrel",
"1of10unrel","2of10unrel","3of10unrel","4of10unrel"] + [str(x) for x in range(1,51)]
import argparse
parser = argparse.ArgumentParser(description='Run sibship for given list of SNPs (or summstats file).')
# parser.add_argument('--lociFile', dest='lociFile', type=str,
# help='File location for SNP chr list of SNPs (tab separated). If none provided, will run on all SNPs.',
# required=False, default="allSnps")
parser.add_argument('--analysisName', dest='analysisName', type=str,
help='Analysis name, used to name output.',required=True)
parser.add_argument('--chrom', dest='chrom', type=int,required=True,
help='Chrom of analysis (in UKB). Required, used to parallelize with multiple jobs over chroms.')
parser.add_argument('--relatedness', metavar='str', dest='relatedness', type=str, action='store', choices=["all","unrelated","WB"], default="WB",
help="Define which set of individuals to use, unrelateds or all. (default: WB for white british).")
parser.add_argument('--sampleSize', metavar='str', dest='sampleSize', type=str, action='store', choices=acceptableSampleSizes, default="all",
help="Define which sample size to use, all or predefined subset. (default: all).")
# parser.add_argument('--out', dest='out', type=str,
# help='Output location; UNUSED.',required=True)
# parser.add_argument('--lociFileRsidCol', dest='lociFileRsidCol', type=str,
# help='RSID column (to ID snps for analysis). default SNP',required=False,default="SNP")
# parser.add_argument('--lociFileChromCol', dest='lociFileChromCol', type=str,
# help='Chrom column (used to subset loci file). default CHR',required=False,default="CHR")
# parser.add_argument('--lociFileEffectAlleleCol', dest='lociFileEffectAlleleCol', type=str,
# help='Effect allele column (used to match alleles). default ALLELE1',required=False,default="ALLELE1")
# parser.add_argument('--summStatsDelim', metavar='str', dest='summStatsDelim', type=str, action='store', choices=["tab","comma","space","\\s"," ",",","\\t","\t"], default="tab",
# help="Use this delimiter when parsing summStats (default: tab).")
parser.add_argument('--verbose', '-v', action='count', default=0)
parser.add_argument('--quiet', '-q', action='count', default=0)
# parser.add_argument('integers', metavar='N', type=int, nargs='+',
# help='an integer for the accumulator')
# inArgs = '--chrom 22 --analysisName boltLMM_unrelateds_noPCsLinreg --lociFile ../ht_shr_leg_upperbody/z040_out_bgen_Height_all_unrel_noPCs_filtered_v2_prunedByDist.tsv'.split(" ")
# inArgs = '--chrom 22 --analysisName Height_all --relatedness WB --sampleSize all'.split(" ")
# args = parser.parse_args(inArgs) ############## testing: uncomment these two lines to hard-code arguments ##############
args = parser.parse_args()
print("args: %s " % args)
chrom = args.chrom
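# Example invocation (the script file name here is hypothetical):
#   python run_hail_gwas.py --chrom 22 --analysisName Height_all --relatedness WB --sampleSize all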
# prunedHeightSnps = pd.read_csv("../ht_shr_leg_upperbody/z040_out_bgen_Height_all_filtered_v2_prunedByDist.tsv",sep="\t")
# inputSSname = "UKB_all"
# prunedHeightSnps = pd.read_csv("/cvar/jhlab/ebartell/Public_GWAS_Summstats/sig_wood_height_pruned_loci_checkTurchinAnalysis.tsv",sep="\t")
# inputSSname = "Wood2014"
# if args.lociFile != "allSnps":
# print("loading summstats.")
# prunedHeightSnps = pd.read_csv(args.lociFile,sep="\t")
# inputSSname = args.analysisName
# if args.lociFileRsidCol != "SNP":
# prunedHeightSnps["SNP"] = prunedHeightSnps[args.lociFileRsidCol]
# if args.lociFileChromCol != "CHR":
# prunedHeightSnps["CHR"] = prunedHeightSnps[args.lociFileChromCol]
# if args.lociFileEffectAlleleCol != "ALLELE1":
# prunedHeightSnps["ALLELE1"] = prunedHeightSnps[args.lociFileEffectAlleleCol]
# prunedHeightSnps_thisChrom = prunedHeightSnps[prunedHeightSnps.CHR == chrom]
# prunedHeightSnps_thisChrom_rsids = list(prunedHeightSnps_thisChrom.SNP)
# print("%s gws variants." % len(prunedHeightSnps_thisChrom_rsids))
print("loading bgen.")
indexMapper = {}
contigRecoding = {}
chromString = "{"
for i in range(1,23):
    indexMapper["/broad/ukbb/imputed_v3/ukb_imp_chr%s_v3.bgen" % i] = "/broad/jhlab_ukbiobank/users/ebartell/qfam/subsetBgens/ukb_imp_chr%s_v3_hail.bgen.idx2" % i
    if i < 10:
        contigRecoding["0%s" % i] = str(i)
        # contigRecoding["%s" % i] = "0" + str(i)
    else:
        contigRecoding["%s" % i] = str(i)
    chromString += "%s," % str(i)
chromString = chromString[:-1] + "}"
### very slow, only runs once. DONE ALREADY
# bgen_idx = hl.index_bgen("/broad/ukbb/imputed_v3/ukb_imp_chr%s_v3.bgen" % chromString, indexMapper,contig_recoding=contigRecoding)
### very fast
bgen = hl.import_bgen("/broad/ukbb/imputed_v3/ukb_imp_chr%s_v3.bgen" % chrom,
entry_fields=['GT', 'GP','dosage'], sample_file="/broad/jhlab_ukbiobank/PhenoData/LinkIds_ukb_500K_ImpData_v3_app11898_chrAUT.sample",
index_file_map=indexMapper)
print("load phenotypes")
#Load phenotype file, through Hail and also normally in python (easier to use, since I don't yet fully know all the Hail functions)
phenoFile = "/broad/jhlab_ukbiobank/users/aravikum/prs_sim/prsice_outputs/" + args.analysisName
phenos = pd.read_csv(phenoFile,sep="\t")
colTypes = {"FID": hl.tstr,"IID":hl.tstr,"Noise_added":hl.tfloat64}
phenosHailTable = hl.import_table(phenoFile, impute=True,
types=colTypes).key_by('IID')
print("load covariates (including PCs)")
covariatesFile = "/broad/jhlab_ukbiobank/users/ebartell/z020_out_covariates.txt"
colTypesCov = {"FID": hl.tstr,"IID":hl.tstr,"ArrayNum":hl.tint32}
for i in range(1,11):
    colTypesCov["PC%s" % i] = hl.tfloat64
covariatesHailTable = hl.import_table(covariatesFile,
types=colTypesCov).key_by('IID')
###################### Load dropped samples and drop them HERE
# print("load sample exclusions and drop them UNIMPLIMENTED")
print("loading individuals always removed.")
individualsToRemove = set()
drop1 = pd.read_csv("/broad/jhlab_ukbiobank/users/jcole/Pheno/bolt.in_plink_but_not_imputed.FID_IID.968.txt",sep=" ",header=None)
individualsToRemove |= set([str(x) for x in drop1[0]])
if args.relatedness in ["WB","unrelated"]:
    drop2 = pd.read_csv("/broad/jhlab_ukbiobank/users/ebartell/bolt.in_v3_sample_but_not_UKBEURN455K.N32265_obtained3.12.19.txt",sep=" ",header=None)
    individualsToRemove |= set([str(x) for x in drop2[0]])
if args.relatedness == "unrelated":
    drop3 = pd.read_csv("/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/samplesNotUnrelated_remove.csv",sep=" ",header=None)
    individualsToRemove |= set([str(x) for x in drop3[0]])
if args.sampleSize != "all":
print("sample size argument %s is not implemented yet. running for all individuals.")
subsetSamplesFileRemove = ""
if args.sampleSize == "halfGwas1":
subsetSamplesFileRemove = "/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/included_individuals_unrel-sample-fam-remove_firstHalfComplement.csv"
elif args.sampleSize == "halfGwas2":
subsetSamplesFileRemove = "/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/included_individuals_unrel-sample-fam-remove_secondHalfComplement.csv"
elif args.sampleSize == "ckbN_halfGwas1":
subsetSamplesFileRemove = "/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/included_individuals_unrel-sample-fam-remove_ckb1_complement.csv"
elif args.sampleSize == "ckbN_halfGwas2":
subsetSamplesFileRemove = "/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/included_individuals_unrel-sample-fam-remove_ckb2_complement.csv"
elif args.sampleSize == "Q1_CKB-N-.5unrel":
subsetSamplesFileRemove = "/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/included_individuals_unrel-sample-fam-remove-complement_CKB-N.5_1Of4Pieces.csv"
elif args.sampleSize == "Q2_CKB-N-.5unrel":
subsetSamplesFileRemove = "/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/included_individuals_unrel-sample-fam-remove-complement_CKB-N.5_2Of4Pieces.csv"
elif args.sampleSize == "Q3_CKB-N-.5unrel":
subsetSamplesFileRemove = "/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/included_individuals_unrel-sample-fam-remove-complement_CKB-N.5_3Of4Pieces.csv"
elif args.sampleSize == "Q4_CKB-N-.5unrel":
subsetSamplesFileRemove = "/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/included_individuals_unrel-sample-fam-remove-complement_CKB-N.5_4Of4Pieces.csv"
elif args.sampleSize == "1of10unrel":
subsetSamplesFileRemove = "/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/included_individuals_unrel-sample-fam-remove-complement_1Of10Pieces.csv"
elif args.sampleSize == "2of10unrel":
subsetSamplesFileRemove = "/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/included_individuals_unrel-sample-fam-remove-complement_2Of10Pieces.csv"
elif args.sampleSize == "3of10unrel":
subsetSamplesFileRemove = "/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/included_individuals_unrel-sample-fam-remove-complement_3Of10Pieces.csv"
elif args.sampleSize == "4of10unrel":
subsetSamplesFileRemove = "/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/included_individuals_unrel-sample-fam-remove-complement_4Of10Pieces.csv"
elif args.sampleSize in set([str(x) in range(1,51)]): ##### 50 subsets
subsetSamplesFileRemove = "/broad/jhlab_ukbiobank/users/ebartell/ht_shr_leg_upperbody/subsetGwas/subsetSamples/included_individuals_unrel-sample-fam-remove-complement_%sOf50Pieces.csv" % args.sampleSize
analysisDirec="unrel50/"
else:
print("sample size argument %s is not implemented yet. running for all individuals.")
if subsetSamplesFileRemove != "":
drop3 = pd.read_csv(subsetSamplesFileRemove,sep=" ",header=None)
individualsToRemove |= set([str(x) for x in drop3[0]])
#Subset bgen by dropping the excluded individuals (keeps only the selected relatedness / sample-size set).
remove_individuals_hl = hl.literal(set(individualsToRemove))
print("subset to relateds")
bgen = bgen.filter_cols(~remove_individuals_hl.contains(bgen.s))
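# Optional sanity check (not part of the original script): confirm how many samples survive
# the exclusions before annotating and running the regression.
# print("%s samples remain after filtering." % bgen.count_cols())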
bgen = bgen.annotate_cols(pheno = phenosHailTable[bgen.s])
bgen = bgen.annotate_cols(covariates = covariatesHailTable[bgen.s])
gwas = hl.linear_regression_rows(
y=bgen.pheno.Noise_added,
x=bgen.GT.n_alt_alleles(),
covariates=[1.0, bgen.covariates.PC1, bgen.covariates.PC2, bgen.covariates.PC3,bgen.covariates.PC4, bgen.covariates.PC5, bgen.covariates.PC6, bgen.covariates.PC7,
bgen.covariates.PC8, bgen.covariates.PC9, bgen.covariates.PC10, bgen.covariates.ArrayNum])
# p = hl.plot.manhattan(gwas.p_value)
# show(p)
gwas_pd = gwas.to_pandas()
# Build a chrom:pos:allele:allele marker name (alleles sorted alphabetically) for each row.
def makeMarkerName(row):
    alleles = row["alleles"]
    alleles.sort()
    return "%s:%s:%s" % (row["locus.contig"],row["locus.position"],":".join(alleles))
gwas_pd["SNP"] = gwas_pd.apply(makeMarkerName,axis=1)
gwas_pd.to_csv("hail/hailGWAS_%s_chrom%s_%s_%s_1core.tsv" % (args.analysisName,chrom,args.relatedness,args.sampleSize),sep='\t',index=False)