Hi,
Yes, sure. Below is the code I'm using for this analysis. It breaks down into two main parts: i) PCA in Hail and ii) training the RF model and predicting in R. Here I'm using an approach we already developed for a previous work (https://doi.org/10.1371/journal.pone.0189875), which combines recursive feature elimination (here, over the PCs) and RF. Ideally, it should be possible to do everything in Python for easier integration, but that requires extra work that I'm planning to do in the near future (see the rough sketch at the end of this post).
i) PCA in Hail
# read the 1000 Genomes Phase 3 VDS
vds_1kg = hc.read('/path/1000G_phase3/vds/vds_1kg_v3.vds')
# Prepare the 1000 Genomes data for PCA
# filtering
vds_filtered_1kg = vds_1kg.filter_multi() # filter out multi-allelic variants.
vds_filtered_1kg = vds_filtered_1kg.filter_variants_expr('v.altAllele().isSNP()', keep=True) # keep only SNPs
# read interval table (Purcell 5k)
purcell5k = KeyTable.import_bed('/path/1000G_phase3/purcell5k_intervals.bed')
vds_filtered_1kg = vds_filtered_1kg.\
    filter_variants_table(purcell5k.key_by('interval'), keep=True) # keep only variants in the Purcell intervals
# Read target VDS
vds = hc.read('/path/wes_vcf_merged.vds')
# Filtering
vds = vds.filter_multi() # filter out multi-allelic variants.
vds = vds.filter_variants_expr('v.altAllele().isSNP()', keep=True) # keep only SNPs
# Merge the datasets on overlapping SNPs (inner join)
vds_merged = vds.join(vds_filtered_1kg)
# computing variant QC on merged VDS
vds_merged = vds_merged.variant_qc().cache()
# Basic variant filtering before PCA
# Keep common SNPs (AF > 1%)
# LD pruning
common_vds = (vds_merged
    .filter_variants_expr('va.qc.AF > 0.01')
    .ld_prune(memory_per_core=512, num_cores=16))
# Perform PCA on the merged VDS (first 10 PCs)
vds_pca = common_vds.pca('sa.pca', k=10, eigenvalues='global.eigen')
# Getting sample annotations as Pandas dataframe
pca_table = vds_pca.samples_table().to_pandas()
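To move the table from the Python side into R, one option (not shown in the original workflow; the path and filename are just illustrative) is to export it as a TSV:
# export the PCs plus sample annotations for the R step below
pca_table.to_csv('/path/pca_table.tsv', sep='\t', index=False)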
ii) Training/testing RF classifier and prediction
library(caret)
######################################################################################
# This script can be used to train a Random Forest classifier and predict
# population ancestry using PCs as features. It has been tested after running PCA on ~5,000
# polymorphic SNPs (Purcell intervals) on two 'merged' datasets: 1) 1000 Genomes
# (used as the reference, since it contains sample population annotations) and 2) the
# target dataset (i.e. the dataset whose sample populations are unknown). Both datasets
# were merged before running the PCA, so the PCs can be used as features for training and prediction.
######################################################################################
# data frame with samples, populations and PCs (i.e. the first 10 PCs) from the Hail workflow,
# here loaded from the TSV exported at the end of part i)
pca_wes10k <- read.delim('/path/pca_table.tsv')
# getting subset with known population info
training <- subset(pca_wes10k, sa.SuperPopulation %in% c("AFR","AMR","EAS","EUR","SAS"))
# Getting subset with unknown population information
# we are going to predict ancestry on this subset...
discovery <- subset(pca_wes10k, !sa.SuperPopulation %in% c("AFR","AMR","EAS","EUR","SAS"))
# Extract training features (principal components)
features <- as.matrix(training[,c(6:15)])
# Extract response variable (Population info)
class <- as.vector(training$sa.SuperPopulation)
# Warning: if you are going to scale your data,
# do it before splitting it; otherwise you will
# introduce a bias in the prediction phase, since
# you would be using 'different' features for training the model
# and for predicting new instances!
# Scale data features
# features <- scale(features, center=TRUE, scale=TRUE)
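# An equivalent way to keep train/test features consistent (illustration only,
# not used here) is to estimate the centering/scaling parameters on the training
# split and reuse them on the test split inside the loop below:
# scaledTrain <- scale(trainDescr, center = TRUE, scale = TRUE)
# scaledTest <- scale(testDescr,
#                     center = attr(scaledTrain, "scaled:center"),
#                     scale = attr(scaledTrain, "scaled:scale"))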
# define some workflow metrics
n <- 10 # number of times to repeat the entire workflow (on randomized train/test splits)
nVars <- vector() # number of features in the final model
accv <- vector() # prediction accuracy per run
for (i in seq(1, n, 1)) {
  # Divide the dataset into train and test sets
  inTrain <- createDataPartition(as.factor(class), p = 2/3, list = FALSE)
  # Create the training dataset
  trainDescr <- features[inTrain,]
  # Create the testing dataset
  testDescr <- features[-inTrain,]
  # create the training class subset
  trainClass <- class[inTrain]
  # create the testing class subset
  testClass <- class[-inTrain]
  #### recursive feature elimination plus random forest
  rfProfile <- rfe(x = trainDescr,
                   y = as.factor(trainClass),
                   maximize = TRUE,
                   metric = 'Accuracy',
                   sizes = c(1:10),
                   rfeControl = rfeControl(functions = rfFuncs,
                                           method = "cv",
                                           number = 10,
                                           verbose = TRUE))
  ## predict the response variable (class) for all test samples with the new model;
  ## for classification, predict.rfe returns a data frame with the predicted
  ## class ('pred') plus one probability column per class
  predictedClass <- predict(rfProfile, newdata = testDescr)
  ## compute prediction accuracy on the predicted class column
  accTable <- postResample(predictedClass$pred, as.factor(testClass))
  accv[i] <- accTable[['Accuracy']]
  ## keep the current model if its accuracy ties or beats the best so far
  if (accv[i] >= max(accv)) {
    bestModel <- rfProfile
  }
  ## retrieve the number of variables used in the final model
  nVar <- rfProfile$optsize
  nVars[i] <- nVar
}
## summary metrics
nVar_mean <- mean(nVars)
acc_mean <- mean(accv)
acc_sd <- sd(accv)
# Data frame with performance (i.e. Accuracy) for each number of variables
results <- bestModel$results
# plotting/saving RFE object
png("performance_vs_variables_RFE_RF.png", width = 800, height = 800)
plot(bestModel, xlab = 'Number of variables')
dev.off()
# Predict ancestry/population from discovery subset using the 'best classifier'
# Extract features (PCs) from discovery subset
unknown <- as.matrix(discovery[,c(6:15)])
# predict new Populations for unknown instances/samples
predicted.ancestry <- predict(bestModel, newdata = unknown)
# rename columns
names(predicted.ancestry) <- c('predictedPopulation',
'probability_AFR',
'probability_AMR',
'probability_EAS',
'probability_EUR',
'probability_SAS')
# add the predicted population to the original data;
# merge by row name here, since row names are preserved from the original data
merged <- merge(discovery, predicted.ancestry, by=0, all = T)
# save file with sample population predicted
write.table(merged, file = 'population_predicted_random_forest.txt', row.names = FALSE, sep = '\t')
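As for the Python port mentioned at the top: a rough, untested sketch of part ii) using scikit-learn could look like the following. RFECV here plays the role of caret's rfe with cross-validation; the column names ('sa.SuperPopulation', 'sa.pca.PC1' ... 'sa.pca.PC10') are assumptions based on the annotation roots used in part i), and the repeated-split loop is collapsed into a single cross-validated run for brevity.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# table exported at the end of part i)
pca_table = pd.read_csv('/path/pca_table.tsv', sep='\t')
populations = ['AFR', 'AMR', 'EAS', 'EUR', 'SAS']
pcs = ['sa.pca.PC{}'.format(i) for i in range(1, 11)]  # assumed column names

known = pca_table[pca_table['sa.SuperPopulation'].isin(populations)]
unknown = pca_table[~pca_table['sa.SuperPopulation'].isin(populations)]

# recursive feature elimination over the PCs with 10-fold CV,
# ranking features by random forest importances
selector = RFECV(estimator=RandomForestClassifier(n_estimators=500, random_state=0),
                 cv=StratifiedKFold(n_splits=10),
                 scoring='accuracy')
selector.fit(known[pcs].values, known['sa.SuperPopulation'].values)

# predict ancestry for the samples with unknown population
predicted = selector.predict(unknown[pcs].values)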