Lauren A. Baker, Mehdi Momen, Kore Chan, Fernando Brito Lopes, Nathan Bollig, Rory J. Todhunter, Guilherme J.M. Rosa, Emily E. Binversie, Susannah J. Sample and Peter Muir
rm(list = ls())
library("methods")
library("gaston")
library("BGLR")
library("pROC")
library("wsrf")
library("caret")
library("e1071")
library("xgboost")
library("Matrix")
library("plyr")
library("ranger")
library("randomGLM")
library("parallel")
library("dplyr")
library("ranger")
library("randomGLM")
library("parallel")
library("foreach")
library("data.table")
library("hmeasure")
library("naivebayes")
library("class")
options(warn = -1)
set.seed(1245)
setwd("~/Desktop/")
options(gaston.autosomes = 1:39)
GEN_List <- gaston::read.bed.matrix("wisc-cornell_cln")
## Reading wisc-cornell_cln.fam
## Reading wisc-cornell_cln.bim
## Reading wisc-cornell_cln.bed
## ped stats and snps stats have been set.
## 'p' has been set.
## 'mu' and 'sigma' have been set.
M <- as.matrix(GEN_List)
for (j in 1:ncol(M)){
M[,j] <- ifelse(is.na(M[,j]), mean(M[,j], na.rm=TRUE), M[,j])
}
Phen <- GEN_List@ped[,1:6]
Phen$Pheno <- Phen$pheno-1
Phen$sex <- as.factor(Phen$sex)
Phen$Pheno <- as.numeric(Phen$Pheno)
Covar <- read.table("wisc-cornell_covariates.txt",header = TRUE)
Covar$sex <- as.numeric(Covar$sex)
Covar$neuter <- as.numeric(Covar$neuter)
Pretiction_Sig_SNP <- list()
Pretiction_All_SNP <- list()
n_folds <- 2 # 10
n <- nrow(Phen)
CV.list <- list()
for(k in 1:n_folds){
input_ones <- Phen[which(Phen$Pheno== 1), ] # all 1's
input_zeros <- Phen[which(Phen$Pheno== 0), ] # all 0's
input_ones_training_rows <- sample(1:nrow(input_ones), 0.8*nrow(input_ones)) # 1's for training
input_zeros_training_rows <- sample(1:nrow(input_zeros), 0.8*nrow(input_ones)) # 0's for training
training_ones <- input_ones[input_ones_training_rows, ] # picking up 1's training
training_zeros <- input_zeros[input_zeros_training_rows, ] # picking up 0's training
train_size<- sum(length(training_ones$id)+length(training_zeros$id))
train_set <- cbind(rep("Trn",train_size),rbind(training_ones,training_zeros))
names(train_set)[1] <- "CV"
#----- Create Test Data
test_name <- rep("Tst",n-train_size)
test_ones <- input_ones[-input_ones_training_rows, ] # 1's for testing
test_zeros <- input_zeros[-input_zeros_training_rows, ] # 0's for testing
test_set <- cbind(test_name,rbind(test_ones, test_zeros)) # Merge the 1's and 0's
names(test_set)[1] <- "CV"
CV.list[[k]] <- rbind(train_set,test_set)
}
#------------------ Weighted RandomForest ------------------------##
weightedRFLearn <- function(training,test) {
mtryNum <- sqrt(ncol(training) - 1)
cr.wrf <- wsrf(Pheno ~ ., data = training, importance = TRUE,
ntree = 100,mtry=mtryNum, nodesize = 2)
predictions_trn <- predict(cr.wrf, training[-1],type=c("response", "aprob"))
wrf.prb.trn <- as.vector(round(predictions_trn$aprob[,2],digits = 3))
wrf.prb.tst_tst <- predict(cr.wrf, test[-1],type=c("response", "aprob"))
wrf.prb.tst <- as.vector(round(wrf.prb.tst_tst$aprob[,2],digits = 3))
AUC <- c("wRF",k,SNP[t],round(pROC::roc(test[,1],wrf.prb.tst)$auc,digits = 5))
return (list(trn=wrf.prb.trn,tst= wrf.prb.tst,AUC=AUC))
}
#----------------- xgboost ----------------------------------------##
xgbLearnAndImport <- function(training,test) {
bol <- sapply(training[1], as.numeric)
rd <- Matrix(data.matrix(training[-1]), sparse = TRUE)
cr.xgb <- xgboost(data = rd, label = bol, max_depth = 10, eval_metric = "error", subsample = 0.8, colsample_bytree = 0.8,
eta = 0.05, nrounds = 1000, objective = "binary:logistic", verbose = 0, gamma = 0.3)
data_trn <- Matrix(data.matrix(training[-1]), sparse = TRUE)
data_tst <- Matrix(data.matrix(test[-1]), sparse = TRUE)
xgb.prob.trn <- round(predict(cr.xgb, data_trn),digits = 3)
xgb.prob.tst <- round(predict(cr.xgb, data_tst),digits = 3)
AUC <- c("xgb",k,SNP[t],round(pROC::roc(test[,1],xgb.prob.tst)$auc,digits = 5))
return (list(trn=xgb.prob.trn,tst= xgb.prob.tst,AUC=AUC))
}
#---------------- naiveBayes ---------------------------------------##
naiveBayesLearn <- function(training,test) {
cr.nb <- gaussian_naive_bayes(x=as.matrix(training[,-1]), y=factor(training[,1]), laplace = 1)
nb.pred.trn <- predict(cr.nb, as.matrix(training[-1]), type = "prob")[,2]
nb.pred.tst <- predict(cr.nb, as.matrix(test[-1]), type = "prob")[,2]
AUC <- c("NB",k,SNP[t],round(pROC::roc(test[,1],nb.pred.tst)$auc,digits = 5))
return (list(trn=nb.pred.trn,tst= nb.pred.tst,AUC=AUC))
}
#--------------- knn ------------------------------------------------##
kNNLearn <- function(training,test) {
colnames(training) <- make.names(colnames(training))
cl <- factor(training[,1])
knn_trn <- knn(training[,-1],training[,-1],cl,k=8,prob = TRUE)
trn_prob <- round(attr(knn_trn, "prob"),digits = 3)
knn_tst <- knn(training[,-1],test[,-1],cl,k=8,prob = TRUE)
tst_prob <- round(attr(knn_tst, "prob"),digits = 3)
AUC <- c("KNN",k,SNP[t],round(pROC::roc(test[,1],tst_prob)$auc,digits = 5))
return (list(trn=trn_prob,tst= tst_prob,AUC=AUC))
}
#-------------------------------- n-Agreement ------------------------##
scores <- function(predictions,labels){
s <- cor.test(as.numeric(as.matrix(predictions)), as.numeric(as.matrix(labels)),
method = "spearman", alternative = "two.sided")
c <- confusionMatrix(table(pred = factor(predictions, levels = c(0:1)), actual = as.matrix(labels)), positive = "1")
h <- HMeasure(relabel(as.matrix(labels)), relabel(factor(predictions, levels = c(0:1))))$metrics
m <- list(s[[4]], c[[3]][1], c[[4]][11], h[1], h[11], h[12], h[3])
m
}
calc_agree_pos <- function(chunk){
num_agree = 0
for(i in 1:8) {
if(chunk[i] == 1) {
num_agree = num_agree + 1
}
}
chunk[11] <- num_agree
chunk[11]
}
standardize(GEN_List) <- 'p'
G <- GRM(GEN_List)
GWAS_List <- list()
Sig_SNP <- list()
Dif_SNP <- list()
for(k in 1:n_folds){
y <- data.frame(CV.list[[k]][which(CV.list[[k]]$CV=="Trn"),])
X <- model.matrix(Pheno ~ sex, data=y)
G_sub <- G[rownames(G)%in%y$id,colnames(G)%in%y$id]
GEN_List_sub <- select.inds(GEN_List, GEN_List@ped$id%in%y$id)
GWAS_Result <- association.test(GEN_List_sub,
Y=y$Pheno,
response="binary",
X=X,
K=G_sub,
method="lmm",
test="score",
verbose = getOption("gaston.verbose",FALSE))
GWAS_List[[k]] <- GWAS_Result[order(GWAS_Result$p),]
GEN_List_Diff <- data.frame(cbind(y$Pheno,M[rownames(M)%in%y$id,]))
colnames(GEN_List_Diff)[1]<- "y"
Case <- GEN_List_Diff[which(GEN_List_Diff$y=="1"),]
Cont <- GEN_List_Diff[which(GEN_List_Diff$y=="0"),]
freq.Case <- colMeans(Case[,-1])/2
freq.Cont <- colMeans(Cont[,-1])/2
meanDiff <- sort(abs(freq.Case-freq.Cont),decreasing = TRUE)
Dif_SNP[[k]] <- meanDiff
}
BM_AUC <- list()
ML_Diff_AUC <- list()
ML_GWAS_AUC <- list()
ENS_AUC<- list()
COV_AUC <- list()
# n_folds = 2 # 10
Bayes_model <- c("BRR")#,"BL","BayesA","BayesB","BayesC")
for(k in 1:n_folds){
start_time <- Sys.time()
#-----------------------------------#
# Bayesian Models
#-----------------------------------#
AUC <- matrix(NA,nrow = length(Bayes_model),ncol = 3)
for(m in 1:length(Bayes_model)){
model <- Bayes_model[m]
nIter= 2000 # 52000
burnIn= 500 # 6000
yBM <- CV.list[[k]][match(row.names(M),CV.list[[k]]$id),]
yBM[which(yBM$CV=="Tst"),"Pheno"] <- NA
index <- as.numeric(row.names(y[which(y$CV=="Tst"),]))
fm_BM <- BGLR(y=yBM$Pheno,
ETA=list(list(~sex+neuter+weight,data=Covar, model='FIXED'),
list(X=M, model=model)),
nIter=nIter,
burnIn=burnIn,
thin=2,
response_type="ordinal",
saveAt='BM_COV',verbose=FALSE)
index <- fm_BM$whichNa
GEBV_BM <- matrix(NA, nrow=length(index), ncol=5)
GEBV_BM[, 1] <- Phen[index, "id"]
GEBV_BM[, 2] <- rep(k, length(index))
GEBV_BM[, 3] <- Phen[index, "Pheno"]
GEBV_BM[, 4] <- as.numeric(ifelse(fm_BM$probs[index,2]> 0.5, 1, 0))
GEBV_BM[, 5] <- fm_BM$probs[index,2]
colnames(GEBV_BM) <- c("id", "k", "Pheno", "yHat", "probs")
AUC[m,] <- c(model,k,round(pROC::roc(Phen[index, "Pheno"],fm_BM$probs[index,2])$auc,digits = 5))
}
BM_AUC[[k]] <- AUC
#--------------------------------------------------------------------------------------------------------#
# Genomic Prediction using Machine Learning methods on only Significant SNP markers using mean difference
#--------------------------------------------------------------------------------------------------------#
SNP <- c(5,10,25)#,50,100,250,500,750,1000,2000,3000,4000,5000,7500,10000,12500,15000)
temp_ML_GWAS_AUC <- list()
temp_ML_Diff_AUC <- list()
temp_Ens_AUC <- list()
for(t in 1:length(SNP)){
mTrainData <- as.data.frame(cbind(Phen$Pheno[-index],Covar[-index,c(3,5,6)],M[-index,names(Dif_SNP[[k]])[1:SNP[t]]]))
mTestData <- as.data.frame(cbind(Phen$Pheno[index],Covar[index,c(3,5,6)],M[index,names(Dif_SNP[[k]])[1:SNP[t]]]))
colnames(mTrainData)[1] <- colnames(mTestData)[1] <- "Pheno"
wRF_1 <- weightedRFLearn(mTrainData, mTestData)
GBT_1 <- xgbLearnAndImport(mTrainData,mTestData)
NB_1 <- naiveBayesLearn(mTrainData,mTestData)
KNN_1 <- kNNLearn(mTrainData,mTestData)
# Ens_rf <- RFLearn(mTrainData,mTestData)
# Ens_rglm<- rglmLearn(mTrainData,mTestData)
#--------------------------------------------------------------------------------------------------------#
# Genomic Prediction using the most signifcant SNPs based on GWAS
#--------------------------------------------------------------------------------------------------------#
mTrainData <- as.data.frame(cbind(Phen$Pheno[-index],Covar[-index,c(3,5,6)],M[-index,GWAS_List[[k]]$id[1:SNP[t]]]))
mTestData <- as.data.frame(cbind(Phen$Pheno[index],Covar[index,c(3,5,6)],M[index,GWAS_List[[k]]$id[1:SNP[t]]]))
colnames(mTrainData)[1] <- colnames(mTestData)[1] <- "Pheno"
wRF_2 <- weightedRFLearn(mTrainData, mTestData)
GBT_2 <- xgbLearnAndImport(mTrainData,mTestData)
NB_2 <- naiveBayesLearn(mTrainData,mTestData)
KNN_2 <- kNNLearn(mTrainData,mTestData)
#AUC_ens_rf <- c("ens_rf",k,SNP[t],round(pROC::roc(ens.tst.Data[,1],predictions)$auc,digits = 5))
temp_ML_Diff_AUC[t] <- list(rbind(wRF_1$AUC,
GBT_1$AUC,
NB_1$AUC,
KNN_1$AUC
))
temp_ML_GWAS_AUC[t] <- list(rbind(wRF_2$AUC,
GBT_2$AUC,
NB_2$AUC,
KNN_2$AUC
))
##------------------------------------------------------------------##
# Ensemble Models
##------------------------------------------------------------------##
ens.trn.Data <- data.frame(mTrainData$Pheno,wRF_1$trn,GBT_1$trn,NB_1$trn,KNN_1$trn,wRF_2$trn,GBT_2$trn,NB_2$trn,KNN_2$trn)
ens.tst.Data <- data.frame(mTestData$Pheno,wRF_1$tst,GBT_1$tst,NB_1$tst,KNN_1$tst,wRF_2$tst,GBT_2$tst,NB_2$tst,KNN_2$tst)
colnames(ens.trn.Data) <- colnames(ens.tst.Data) <- c("Pheno","wRF1","GBT1","NB1","KNN1","wRF2","GBT2","NB2","KNN2")
ens_glm <- glm(Pheno ~ ., family = binomial, data = ens.trn.Data)
predictions <- round(predict.glm(ens_glm, newdata = ens.tst.Data, type = "response"),digits = 3)
ens_glm <- as.numeric(ifelse(predictions > 0.5, 2, 1))
AUC_ens_glm <- c("ens_glm",k,SNP[t],round(pROC::roc(ens.tst.Data[,1],predictions)$auc,digits = 5))
ens_rf <- wsrf(Pheno ~ ., family = binomial, data = ens.trn.Data, importance = TRUE, ntree = 1000, nodesize = 20)
predictions <- predict(ens_rf, ens.tst.Data[-1], type = "prob")$prob[,2]
AUC_ens_rf <- c("ens_rf",k,SNP[t],round(pROC::roc(ens.tst.Data[,1],predictions)$auc,digits = 5))
#-------------- n-Agreement -------------#
n=8
r <- data.table("gwRF_1"= as.numeric(ifelse(wRF_1$tst > 0.5, 1, 0)),
"gGBT_1"= as.numeric(ifelse(GBT_1$tst > 0.5, 1, 0)),
"gNB_1" = as.numeric(ifelse(NB_1$tst > 0.5, 1, 0)),
"gKNN_1"= as.numeric(ifelse(KNN_1$tst > 0.5, 1, 0)),
"gwRF_2"= as.numeric(ifelse(wRF_2$tst > 0.5, 1, 0)),
"gGBT_2"= as.numeric(ifelse(GBT_2$tst > 0.5, 1, 0)),
"gNB_2" = as.numeric(ifelse(NB_2$tst > 0.5, 1, 0)),
"gKNN_2"= as.numeric(ifelse(KNN_2$tst > 0.5, 1, 0))
)
num_agree_pos <- apply(FUN = calc_agree_pos,X=r, MARGIN = 1)
r_ensemble <- cbind(r, num_agree_pos)
class_based_on_agreement <- function(n){
logical <- as.numeric(as.numeric(as.character(r_ensemble$num_agree_pos)) >= n)
logical <- replace(logical, logical==1, 1)
replace(logical, logical==0, 0)
}
r_ensemble <- cbind(r_ensemble,
class_based_on_agreement(1),
class_based_on_agreement(2),
class_based_on_agreement(3),
class_based_on_agreement(4),
class_based_on_agreement(5),
class_based_on_agreement(6),
class_based_on_agreement(7),
class_based_on_agreement(8)
)
colnames(r_ensemble)[10:17] <- c("agree_1", "agree_2", "agree_3", "agree_4","agree_5", "agree_6", "agree_7", "agree_8")
ensemble_results <- list(r_ensemble$agree_1, r_ensemble$agree_2,
r_ensemble$agree_3, r_ensemble$agree_4,
r_ensemble$agree_5, r_ensemble$agree_6,
r_ensemble$agree_7, r_ensemble$agree_8)
metrics <- data.frame()
for(s in 1:8){
new_row <- as.data.frame(scores(ensemble_results[[s]], mTestData$Pheno))
colnames(new_row) <- c("spearman", "accuracy", "bal accuracy",
"H measure", "sensitivity", "specificity", "AUC")
metrics <- rbind(metrics, new_row)
}
rownames(metrics) <- c("1","2","3","4","5","6","7","8")
opt_auc <- c(paste0("nAgr:",rownames(metrics[which.max(metrics$AUC),])),k,SNP[t],
round(metrics$AUC[which.max(metrics$AUC)],digits = 4))
temp_Ens_AUC[t] <- list(rbind(AUC_ens_glm,AUC_ens_rf,opt_auc))
}
ML_Diff_AUC[k] <- list(do.call("rbind",temp_ML_Diff_AUC))
ML_GWAS_AUC[k] <- list(do.call("rbind",temp_ML_GWAS_AUC))
ENS_AUC[k] <- list(do.call("rbind",temp_Ens_AUC))
#------- -------------------------------------------------------------------------------------------#
# Genomic Prediction using logit model only on the covariates
#--------------------------------------------------------------------------------------------------#
logTrainData <- Covar[-index,]
log.TestData <- Covar[index,]
temp_COV_AUC <- list()
logit.model<- glm(pheno~ factor(sex)+factor(neuter)+weight,
family=binomial(link = "logit"),data=logTrainData)
logit.prob <- round(predict(logit.model,
log.TestData,type='response'),digits = 3)
logit.yhat <- ifelse(logit.prob > 0.5,1,0)
logit.res <- data.frame(Phen[index, "id"],
rep(k, length(index)),
Phen[index, "Pheno"],
logit.yhat,
logit.prob)
colnames(logit.res) <- c("id","k","Pheno","yhat","logit.prb")
COV_AUC[k] <- list(c("Logist",k,round(pROC::roc(log.TestData[,4],logit.prob)$auc,digits = 5)))
end_time <- Sys.time()
cat(end_time - start_time)
}
## 4.0252523.879208
write.table(do.call("rbind",BM_AUC), file = "BM_AUC.csv",quote = FALSE,sep = "\t",
row.names=FALSE, col.names=c("Model","Fold","AUC"))
write.table(do.call("rbind",ML_Diff_AUC), file = "ML_Diff_AUC.csv",quote = FALSE,sep = "\t",
row.names=FALSE, col.names=c("Model","Fold","SNP","AUC"))
write.table(do.call("rbind",ML_GWAS_AUC), file = "ML_GWAS_AUC.csv",quote = FALSE,sep = "\t",
row.names=FALSE, col.names=c("Model","Fold","SNP","AUC"))
write.table(do.call("rbind",COV_AUC), file = "COV_AUC.csv",quote = FALSE,sep = "\t",
row.names=FALSE, col.names=c("Model","Fold","AUC"))