Hi - no this is 0.1. I would switch, but we’re in the middle of trying to meet a tight deadline and I’m reluctant to try to learn the new version…though I could be convinced!
Here’s the pipeline, as run on dataproc using Jupyter notebook (apologies in advance for length). The final block with the loop is where it falls apart:
# Load Hail
import hail
from hail import *
hc = HailContext()
# Load packages
import pandas as pd
import numpy as np
from math import log, isnan
import os,sys,glob,gzip
from pprint import pprint
# Load toy VDS
vds = hc.read('gs://donnawerlingucsf/devEQTL/filterPart2_toy-180402.vds')
%%bash
# Copy expression data table from GS to cluster
gsutil cp gs://donnawerlingucsf/devEQTL/misc_files/counts.gene.cleaned-to-Donna-02092018-transpose.tsv counts.gene.cleaned-to-Donna-02092018-transpose.tsv
# Import sample annotation table with full expression data as pandas data frame
exprdf = pd.read_csv('counts.gene.cleaned-to-Donna-02092018-transpose.tsv', sep='\t', index_col=0)
# Samples from VDS
sampsInVds = vds.sample_ids
# Subset the expression data frame
exprdf = exprdf.loc[sampsInVds,:]
# Calculate per-gene expression sums across this subsetted sample set
exprsums = exprdf.sum(axis=0)
# Find non-expressed genes
noExpr = exprsums[exprsums.le(0)]
# Remove these genes from the expression data
exprdf = exprdf.drop(noExpr.index, axis=1)
# Write this df out to file
exprdf.to_csv(path_or_buf='expressionFilt_176s_40707g-180411.txt', sep='\t')
%%bash
# Transfer file to GS
gsutil cp expressionFilt_176s_40707g-180411.txt gs://donnawerlingucsf/devEQTL/misc_files/
%%capture
# Re-load this expression data object as a keyTable
exprdat = hc.import_table('gs://donnawerlingucsf/devEQTL/misc_files/expressionFilt_176s_40707g-180411.txt', impute=True).key_by('Sample')
# Add expression to sample annotations for this VDS
vds = vds.annotate_samples_table(exprdat, root='sa.exprdat')
# Import sample annotation table with sample metadata information
metadat = hc.import_table('gs://donnawerlingucsf/devEQTL/misc_files/eQTL_metadata_forDex-format-180316.txt', impute=True).key_by('Braincode')
# Add metadata to sample annotations for this VDS
vds = vds.annotate_samples_table(metadat, root='sa.info')
# Add integer-coded variable for Sex
vds = vds.annotate_samples_expr('sa.info.sexCode = if (sa.info.Sex == "f") 1 else if (sa.info.Sex == "m") 0 else NA: Int')
# Import gene locations file
geneloc = hc.import_table('gs://donnawerlingucsf/devEQTL/misc_files/geneLocations.geneIdForHail.gencode.v21.txt', impute=True).key_by('geneId')
# Convert geneloc keyTable to pandas dataframe
genedf = geneloc.to_pandas()
%%capture
# Run the full loop, on all 40707 genes
res_table = pd.DataFrame()
for i in range (0, len(expr_genes)-1):
#
gene = '`' + expr_genes[i] + '`'
testPheno = 'sa.exprdat.' + gene
#
chrVar = list(genedf.loc[genedf['geneId'] == gene]['chr'])[0]
#
startVar = list(genedf.loc[genedf['geneId'] == gene]['start'])[0]
#
endVar = list(genedf.loc[genedf['geneId'] == gene]['end'])[0]
# Run linear regression for this gene
vds_res = vds.filter_intervals(Interval(Locus(chrVar, startVar-1000000), Locus(chrVar, endVar+1000000)))\
.linreg(testPheno, covariates=['sa.info.Period','sa.info.sexCode'])
# Pull out the results for this cis-eQTL analysis, filter to p<0.05, rename columns, convert to pandas data frame
this_table = vds_res.variants_table().flatten()\
.select(['v','va.rsid','va.linreg.beta','va.linreg.se','va.linreg.tstat','va.linreg.pval'])\
.filter("`va.linreg.pval` > 0.05", keep=False)\
.rename(['Variant','rsID','Beta','SE','Tstat','P'])\
.annotate('Variant = str(Variant)') \
.to_pandas(expand=False)
# Check - are there any significant results for this gene?
nSig = this_table.size
if nSig >= 1:
# Add gene name to this table
this_table['GeneId'] = gene.replace('`','')
this_table[['EnsemblID','GeneSymbol']] = this_table['GeneId'].str.split('|',expand=True)
# Split up the information in the variant column, re-order columns
this_table[['Chr','Start','Ref','Alt']] = this_table['Variant'].str.split(':',expand=True)
this_table = this_table[['GeneId','EnsemblID','GeneSymbol','Variant','Chr','Start','Ref','Alt','rsID','Beta','SE','Tstat','P']].head()
# Add results to existing master results table
res_table = pd.concat([res_table,this_table])