--- title: "G2F BGLR Post Prediction Analysis" author: "Anna R. Rogers" date: "10/13/2020" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) #Load libraries library(Matrix) library(tidyverse) library(BGLR) library(png) library(grid) #library(ggpubr) library(doBy) library(knitr) library(leaflet) WorkDir <- "G:/My Drive/Anna/Prediction_G2F/" OutDir <- "G:/My Drive/Anna/Paper2_Figures_Code/Figures/" ``` ```{r Read in Heritability and other files that can be used for comparison of multiple analyses} #These are files that were used in Rogers et al. 2021 G3 "The Importance of Dominance and Genotype-by-Environment Interactions on Grain Yield Variation in a Large-Scale Public Cooperative Maize Experiment" Stage1_ModelingOut <- read.csv("G:/My Drive/Anna/asreml_3yrs/Stage 1 BLUEs/G2F_2014-16_Stage1_ModelInfo.csv", header = TRUE) Stage1_ModelingOut <- Stage1_ModelingOut[Stage1_ModelingOut$Trait == "Yield_Mg_ha", ] Stage1_ModelingOut <- Stage1_ModelingOut %>% select(Env, Verr, H.plot, H.mean) ``` # Question 1: What Marker Matrix size/composition is needed to account for genetic main effects? Models to run through: 1. Additive Marker Matrix Models * Additive 5k * Additive 10k * Additive 15k * Additive 20k 2. Dominance Marker Matrix Models * Dominance 5k * Dominance 10k * Dominance 15k * Dominance 20k 3. A + D Combos * A + D 5k * A + D 10k * A + D 15k * A + D 20k ```{r Functions for reading and tabulating data} Confirm_Complete <- function(folder){ #Confirm that model output was complete by checking the out files and err files from the HPC file_list <- list.files(path = paste0(WorkDir, folder)) out_files <- file_list[str_detect(file_list, pattern = "out.")] #Pick up the out files err_files <- file_list[str_detect(file_list, pattern = "err.")] #Pick up the err files out_df <- {} #empty place to store data for(i in 1:length(out_files)){ out_i <- readLines(con = paste0(WorkDir, folder, out_files[i])) exit_0 <- out_i[str_detect(out_i, pattern = "Successfully completed")] #Check if the exit code was 0 -> no errors if (length(exit_0) == 1) { #If the exit code is zero set out_complete to TRUE, else set to false out_complete <- TRUE } else out_complete <- FALSE err_i <- readLines(con = paste0(WorkDir, folder, err_files[i])) errs_detected <- err_i[str_detect(err_i, pattern = "Error")] if(length(errs_detected != 0)){ errs_present <- TRUE } else errs_present <- FALSE n_trn <- out_i[str_detect(out_i, pattern = "N-TRN")][1] n_test <- as.numeric(str_remove(n_trn, pattern = " N-TRN= .* N-TST= ")) n_trn <- as.numeric(str_remove(str_remove(n_trn, pattern = " N-TRN= "), pattern = " N-TST= .*")) tmp <- c(i, out_complete, errs_present, n_trn, n_test) out_df <- rbind(out_df, tmp) } out_df <- as.data.frame(out_df) names(out_df) <- c("fold", "out_complete", "errs_present", "n_trn", "n_test") out_df$out_complete <- as.logical(out_df$out_complete) out_df$errs_present <- as.logical(out_df$errs_present) #Return message to confirm output is correct return(out_df) } #Function to read in tabulated results Read_Tab_Results <- function(folder){ file_list <- list.files(path = paste0(WorkDir, folder)) #Grab the file list from the given folder #Grab the list of the tab result files tab_files <- file_list[str_detect(file_list, pattern = "tab_results.csv")] #Find the tabulated result files tab_results <- {} #Set up the empty frame to hold results for(i in 1:length(tab_files)){ tab_df <- read.csv(paste0(WorkDir, folder, tab_files[i]), header = TRUE) #Read in the ith result file tab_df$model <- str_remove(tab_files[i], pattern = "_tab_results.csv") tab_results <- rbind(tab_results, tab_df) #Bind the new input to the result frame } return(tab_results) #Return the tabulated results } #Function to read in the within env results Read_Within_Env_Results <- function(folder){ file_list <- list.files(path = paste0(WorkDir, folder)) #Grab the file list from the given folder W_Env_files <- file_list[str_detect(file_list, pattern = "WithinEnv_Results.csv")] #Find the within env result files W_Env_results <- {} #Set up an empty frame to hold results for(i in 1:length(W_Env_files)){ W_Env_df <- read.csv(paste0(WorkDir, folder, W_Env_files[i]), header = TRUE) #read in file i from the list W_Env_df$model <- str_remove(W_Env_files[i], pattern = "_WithinEnv_Results.csv") W_Env_results <- rbind(W_Env_results, W_Env_df) #Bind the new input to the result frame } return(W_Env_results) #Return the result df } #Function to read Predictions Read_Predictions <- function(folder){ file_list <- list.files(path = paste0(WorkDir, folder)) #Create the file list from the contents of the folder Pred_files <- file_list[str_detect(file_list, pattern = "Predictions.csv")] #Extract the files that are prediction files Predictions <- {} #Create an empty frame to store results for(i in 1:length(Pred_files)){ Pred_df <- read.csv(paste0(WorkDir, folder, Pred_files[i]), header = TRUE) #Read the ith Predictions file Pred_df$model <- str_remove(Pred_files[i], pattern = "_Predictions.csv") Predictions <- rbind(Predictions, Pred_df) } return(Predictions) } Bayes_Diagnostic_Check <- function(folder, params = c("mu", "varE"), folds = 10){ file_list <- list.files(path = paste0(WorkDir, folder)) for(i in 1:folds){ fold_files <- file_list[str_detect(file_list, pattern = paste0("fold_", i, "_"))] for(param_i in params){ param_files <- fold_files[str_detect(fold_files, pattern = param_i)] param_vals <- read.table(paste0(WorkDir, folder, param_files), col.names = "value") param_vals$iter <- seq(from = 1, to = 12000, by = 10) png(filename = paste0(WorkDir, folder, "Fold_", i, param_i, ".png"), width = 600, height = 400, units = "px") plot(x = param_vals$iter, y = param_vals$value, type = "l") + title(main = paste0("Fold ", i, " Parameter ", param_i)) dev.off() } } return(paste0("Output diagnostic images have been save at ", WorkDir, folder, ".")) } merge_pngs <- function(png_folder = folder, WorkDir = WorkDir, OutDir = OutDir){ file_list <- list.files(path = paste0(WorkDir, png_folder)) png_list <- file_list[str_detect(file_list, "png")] pdf(file = paste0(WorkDir, png_folder, "Diagnostic_Plots.pdf")) lapply(png_list, function(x){ img <- as.raster(readPNG(paste0(WorkDir, png_folder, x))) grid.newpage() grid.raster(img, interpolate = FALSE) }) dev.off() return(print(paste0("Output PDF is in ", WorkDir, png_folder, "Diagnostic_Plots.pdf"))) } Compute_Within_Env_Statistics <- function(Within_Env_Data, Bias_Data, Heritability_Data = Stage1_ModelingOut){ W_Env_Means <- Within_Env_Data %>% group_by(Env) %>% summarise(Mean_PredAbi = mean(PredAbi), SD_PredAbi = sd(PredAbi), Mean_RankCor = mean(RankCor), SD_RankCor = sd(RankCor), Max_PredAbi = max(PredAbi), Min_PredAbi = min(PredAbi)) Bias_Means <- Bias_Data %>% group_by(Env) %>% summarise(mean_bias = mean(bias)) W_Env_Means <- Heritability_Data %>% select(Env, H.mean) %>% right_join(y = W_Env_Means) %>% left_join(y = Bias_Means) return(W_Env_Means) } Compute_Bias_by_Env <- function(prediction_data){ W_Env_Bias <- Prediction_Data %>% group_by(Env, Fold) %>% summarise(mean_obs = mean(Phenotype, na.rm = TRUE), mean_pred = mean(GEBV, na.rm = TRUE)) %>% mutate(bias = mean_obs - mean_pred) return(W_Env_Bias) } Extract_Loss <- function(folder_list){ #Make Empty places to store things running_tab <- {} running_out <- {} for(i in 1:length(folder_list)){ print(paste0("Working on folder ", i, ": ", folder_list[i])) #First read in the relevant tab results and out results tab_i <- Read_Tab_Results(paste0(folder_list[i], "/")) out_i <- Confirm_Complete(paste0(folder_list[i], "/")) tab_i <- left_join(tab_i, select(out_i, "n_trn", "n_test", "fold")) %>% mutate(Sampling_Scheme = str_remove(tab_i$model,pattern = "_fold_.*")) #Find a close size of baseline to compare to for(j in 1:nrow(tab_i)){ base_samp <- abs(tab_i$n_trn[j] - GxE_Base$n_trn) which_base <- which.minn(base_samp, n = 10) tab_i$Base_Accuracy[j] <- mean(GxE_Base$PredAbi[which_base]) } #mean_sample_size <- mean(tab_i$n_trn, na.rm = TRUE) #base_means <- abs(mean_sample_size - GxE_Base$n_trn) #compute abs difference from the current schemas training size #which_base <- which.minn(base_means, n = 10) #Figure out which base runs to use for comparison #tab_i$Base_Accuracy <- mean(GxE_Base$PredAbi[which_base]) tab_i$Loss <- tab_i$PredAbi - tab_i$Base_Accuracy #bind the tab and out to their respective results running_tab <- rbind(running_tab, tab_i) running_out <- rbind(running_out, out_i) } #Coerce results to list for output running_tab <- list(tab = running_tab, out = running_out) return(running_tab) } Extract_Within_Env <- function(folder_list){ #Set up empty frames to put things in running_WEnv <- {} running_out <- {} #Figure out how many obs per environment load(paste0(WorkDir, "R_data_objects/PhenoG2F.Rdata")) Obs_per_Env <- PhenoG2F %>% count(Env) for(i in 1:length(folder_list)){ print(paste0("Working on folder ", i, ": ", folder_list[i])) #First read in the relevant tab results and out results W_env_i <- Read_Within_Env_Results(paste0(folder_list[i], "/")) Preds_i <- Read_Predictions(paste0(folder_list[i], "/")) n_test <- Preds_i %>% count(Env, Fold) %>% rename(w_env_n_test = n, fold = Fold) #Figure out how many obs in test vs train Obs_per_Env_i <- n_test %>% left_join(y = Obs_per_Env, by = "Env") %>% select(Env, fold, n, w_env_n_test) %>% mutate(w_env_n_trn = n - w_env_n_test) out_i <- Confirm_Complete(paste0(folder_list[i], "/")) #Join the train test info to the within env info W_env_i <- W_env_i %>% left_join(y = Obs_per_Env_i, by = c("Env", "fold")) %>% mutate(Sampling_Scheme = str_remove(W_env_i$model, pattern = "_fold_.*")) W_env_i <- W_env_i %>% left_join(select(out_i, -out_complete, -errs_present), by = "fold") #Compute Mean Predicted and Observed values Mean_Summ <- Preds_i %>% group_by(Env) %>% summarise(Mean_Obs = mean(Phenotype, na.rm = TRUE), Mean_Pred = mean(GEBV, na.rm = TRUE)) W_env_i <- W_env_i %>% left_join(y = Mean_Summ, by = "Env") w_env_base_comp <- {} #Compare to a scheme with same across env training size -doesn't need to be within Env train size being the same. for(j in 1:length(unique(W_env_i$fold))){ fold_j <- j print(paste0("Working on fold ", j, " of ", length(unique(W_env_i$fold)), ".")) trn_size <- W_env_i$n_trn[W_env_i$fold == j][1] base_samp <- abs(trn_size - GxE_Base$n_trn) which_base <- which.minn(base_samp, n = 10) which_comp <- GxE_Base[which_base, ] %>% select(fold, n_trn, n_test, folder, model) sub_comp <- base_within_env %>% right_join(which_comp) sub_comp <- sub_comp %>% group_by(Env) %>% summarise(mean_pred_abi = mean(PredAbi, na.rm = TRUE), sd_pred_abi = sd(PredAbi, na.rm = TRUE)) sub_comp$n_trn <- trn_size w_env_base_comp <- rbind(w_env_base_comp, sub_comp) } W_env_i <- W_env_i %>% left_join(w_env_base_comp) %>% mutate(Loss = PredAbi - mean_pred_abi, Bias = Mean_Pred - Mean_Obs) #Loss is the predictive ability - mean pred ability from comparable sample. so a negative value indicates decreased predictive accuracy, a positive value indicates gain in predictive accuracy #bind the tab and out to their respective results running_WEnv <- rbind(running_WEnv, W_env_i) running_out <- rbind(running_out, out_i) } #Coerce results to list for output running_WEnv <- list(tab = running_WEnv, out = running_out) return(running_WEnv) } #Function to load all output from a folder into a list object Load_Output <- function(folder){ Output <- list() Loaded_out <- Confirm_Complete(folder) Loaded_tab_res <- Read_Tab_Results(folder) Loaded_W_Env <- Read_Within_Env_Results(folder) Loaded_Predictions <- Read_Predictions(folder) Output <- list(out = Loaded_out, tab_res = Loaded_tab_res, W_Env = Loaded_W_Env, Predictions = Loaded_Predictions) return(Output) } Read_Folders <- function(folder_list){ Compiled_Out <- list(out = NULL, tab_res = NULL, W_Env = NULL, Predictions = NULL) for(folder_i in 1:length(folder_list)){ cat(paste0("Working on folder:", folder_list[folder_i], "."), "\n") out_i <- Load_Output(folder_list[folder_i]) out_i$tab_res <- left_join(out_i$tab_res, select(out_i$out, "fold", "n_trn", "n_test")) Compiled_Out$out <- rbind(Compiled_Out$out, out_i$out) Compiled_Out$tab_res <- rbind(Compiled_Out$tab_res, out_i$tab_res) Compiled_Out$W_Env <- rbind(Compiled_Out$W_Env, out_i$W_Env) Compiled_Out$Predictions <- rbind(Compiled_Out$Predictions, out_i$Predictions) } return(Compiled_Out) } Create_Folder_List <- function(WorkDir, Data_Dir){ #A function to list folders for input in loading multiple folders of data from a given data directory. folder_list <- list.dirs(paste0(WorkDir, Data_Dir))[-1] folder_list <- str_remove(folder_list, WorkDir) folder_list <- str_replace(folder_list, pattern = "//", replacement = "/") folder_list <- paste0(folder_list, "/") return(folder_list) } Model_Type <- function(output_list, given_method){ #function to attach model info for CV method and what genre of model this is to the output #arguments: #output_list: the list output from using the Read_Folders function or a Load_Output function #given method: user input string, what kind of model was run (ex. "G + E") #Extract the cross validation method from the model column output_list$tab_res$CV_Method <- str_remove(output_list$tab_res$model, pattern = "_.*") output_list$W_Env$CV_Method <- str_remove(output_list$W_Env$model, pattern = "_.*") output_list$Predictions$CV_Method <- str_remove(output_list$Predictions$model, pattern = "_.*") #Input the model type column output_list$tab_res$Type <- given_method output_list$W_Env$Type <- given_method output_list$Predictions$Type <- given_method return(output_list) } ``` ## Additive Results ```{r Additive 5k Results} folder <- "Marker_Choice_Model_Out/Additive_5k/" #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) A_5k <- Load_Output(folder) A_5k$Predictions$Fold <- factor(A_5k$Predictions$Fold, levels = c(1:10)) ggplot(A_5k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r Additive 10k Results} folder <- "Marker_Choice_Model_Out/Additive_10k/" #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) A_10k <- Load_Output("Marker_choice_Model_Out/Additive_10k/") A_10k$Predictions$Fold <- factor(A_10k$Predictions$Fold, levels = c(1:10)) ggplot(A_10k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r Additive 15k Results} folder <- "Marker_Choice_Model_Out/Additive_15k/" #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data A_15k <- Load_Output(folder) A_15k$Predictions$Fold <- factor(A_15k$Predictions$Fold, levels = c(1:10)) ggplot(A_15k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r Additive 20k Results} folder <- "Marker_Choice_Model_Out/Additive_20k/" #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data A_20k <- Load_Output(folder) A_20k$Predictions$Fold <- factor(A_20k$Predictions$Fold, levels = c(1:10)) ggplot(A_20k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` **Make better plots later Anna** ```{r combine additive tab_results} tab_combined <- bind_rows(A_5k$tab, A_10k$tab, A_15k$tab, A_20k$tab) tab_combined$model <- str_remove(str_remove(tab_combined$model, pattern = "_fold_[0-9]{1,2}"), pattern = "CV1_") ggplot(tab_combined, aes(x = model, y = PredAbi)) + geom_point() ``` ## Dominance Results ```{r Dominance 5k Results} folder <- "Marker_Choice_Model_Out/Dominance_5k/" #Check that the run was complete D_5k_out <- Confirm_Complete(folder) #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_5k <- Load_Output(folder) D_5k$Predictions$Fold <- factor(D_5k$Predictions$Fold, levels = c(1:10)) ggplot(D_5k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r Dominance 10k Results} folder <- "Marker_Choice_Model_Out/Dominance_10k/" #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_10k <- Load_Output(folder) D_10k$Predictions$Fold <- factor(D_10k$Predictions$Fold, levels = c(1:10)) ggplot(D_10k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r Dominance 15k Results} folder <- "Marker_Choice_Model_Out/Dominance_15k/" #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_15k <- Load_Output(folder) D_15k$Predictions$Fold <- factor(D_15k$Predictions$Fold, levels = c(1:10)) ggplot(D_15k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r Dominance 20k Results} folder <- "Marker_Choice_Model_Out/Dominance_20k/" #Check that the run was complete D_20k_out <- Confirm_Complete(folder) #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_20k <- Load_Output(folder) D_20k$Predictions$Fold <- factor(D_20k$Predictions$Fold, levels = c(1:10)) ggplot(D_20k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r} tab_combined <- bind_rows(A_5k$tab, A_10k$tab, A_15k$tab, A_20k$tab, D_5k$tab, D_10k$tab, D_15k$tab, D_20k$tab) tab_combined$model <- str_remove(str_remove(tab_combined$model, pattern = "_fold_[0-9]{1,2}"), pattern = "CV1_") tab_combined$model <- factor(tab_combined$model, levels = c("A_5k", "A_10k", "A_15k", "A_20k", "D_5k", "D_10k", "D_15k", "D_20k", "A+D_5k", "A+D_10k", "A+D_15k", "A+D_20k")) tab_combined$G_Term <- str_extract(tab_combined$model, "[A-Z]") tab_combined$Marker_Num <- as.numeric(str_extract(tab_combined$model, "[0-9]{1,2}")) * 1000 ggplot(tab_combined, aes(x = Marker_Num, y = PredAbi, color = G_Term, group = Marker_Num)) + geom_point() ggplot(tab_combined, aes(x = model, y = PredAbi, color = G_Term)) + geom_boxplot() + geom_point() ``` ## A + D Results ```{r A D 5k} folder <- "Marker_Choice_Model_Out/A_D_5k/" #Check that the run was complete A_D_5k_out <- Confirm_Complete(folder) #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Additive_varB", "ETA_Dominance_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data A_D_5k <- Load_Output(folder) A_D_5k$Predictions$Fold <- factor(A_D_5k$Predictions$Fold, levels = c(1:10)) ggplot(A_D_5k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r A D 10k} folder <- "Marker_Choice_Model_Out/A_D_10k/" #Check that the run was complete A_D_10k_out <- Confirm_Complete(folder) #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Additive_varB", "ETA_Dominance_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data A_D_10k<- Load_Output(folder) A_D_10k$Predictions$Fold <- factor(A_D_10k$Predictions$Fold, levels = c(1:10)) ggplot(A_D_10k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r A D 15k} folder <- "Marker_Choice_Model_Out/A_D_15k/" #Check that the run was complete A_D_15k_out <- Confirm_Complete(folder) #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Additive_varB", "ETA_Dominance_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data A_D_15k <- Load_Output(folder) A_D_15k$Predictions$Fold <- factor(A_D_15k$Predictions$Fold, levels = c(1:10)) ggplot(A_D_15k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r A D 20k} folder <- "Marker_Choice_Model_Out/A_D_20k/" #Check that the run was complete A_D_20k_out <- Confirm_Complete(folder) #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Additive_varB", "ETA_Dominance_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data A_D_20k <- Load_Output(folder) A_D_20k$Predictions$Fold <- factor(A_D_20k$Predictions$Fold, levels = c(1:10)) ggplot(A_D_20k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ## Weighted Dominance Results ```{r Dominance 5k Weighted Results} folder <- "Marker_Choice_Model_Out/Dominance_5k_W/" #Check that the run was complete DW_5k_out <- Confirm_Complete(folder) #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data DW_5k <- Load_Output(folder) DW_5k$Predictions$Fold <- factor(DW_5k$Predictions$Fold, levels = c(1:10)) ggplot(DW_5k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r Dominance 10k Weighted Results} folder <- "Marker_Choice_Model_Out/Dominance_10k_W/" #Check that the run was complete DW_10k_out <- Confirm_Complete(folder) #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data DW_10k <- Load_Output(folder) DW_10k$Predictions$Fold <- factor(DW_10k$Predictions$Fold, levels = c(1:10)) ggplot(DW_10k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r Dominance 15k Weighted Results} folder <- "Marker_Choice_Model_Out/Dominance_15k_W/" #Check that the run was complete DW_15k_out <- Confirm_Complete(folder) #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data DW_15k <- Load_Output(folder) DW_15k$Predictions$Fold <- factor(DW_15k$Predictions$Fold, levels = c(1:10)) ggplot(DW_15k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r Dominance 20k Weighted Results} folder <- "Marker_Choice_Model_Out/Dominance_20k_W/" #Check that the run was complete DW_20k_out <- Confirm_Complete(folder) #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB")) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data DW_20k <- Load_Output(folder) DW_20k$Predictions$Fold <- factor(DW_20k$Predictions$Fold, levels = c(1:10)) ggplot(DW_20k$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r Combined Tabulation of Marker Choice models} tab_combined <- bind_rows(A_5k$tab, A_10k$tab, A_15k$tab, A_20k$tab, D_5k$tab, D_10k$tab, D_15k$tab, D_20k$tab, A_D_5k$tab, A_D_10k$tab, A_D_15k$tab, A_D_20k$tab) #, DW_5k$tab, DW_10k$tab, DW_15k$tab, DW_20k$tab) tab_combined$model <- str_remove(str_remove(tab_combined$model, pattern = "_fold_[0-9]{1,2}"), pattern = "CV1_") tab_combined$model <- factor(tab_combined$model, levels = c("A_5k", "A_10k", "A_15k", "A_20k", "D_5k", "D_10k", "D_15k", "D_20k", "A+D_5k", "A+D_10k", "A+D_15k", "A+D_20k"))#, "D_5k_W", "D_10k_W", "D_15k_W", "D_20k_W")) tab_combined$G_Term <- str_remove(tab_combined$model, "_[0-9]{1,2}k") tab_combined$G_Term[tab_combined$G_Term == "A"] <- "Additive" tab_combined$G_Term[tab_combined$G_Term == "D"] <- "Dominance" tab_combined$Marker_Num <- as.numeric(str_extract(tab_combined$model, "[0-9]{1,2}")) * 1000 ggplot(tab_combined, aes(x = Marker_Num, y = PredAbi, color = G_Term, group = Marker_Num)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ggplot(tab_combined, aes(x = model, y = PredAbi, fill = G_Term)) + geom_boxplot(alpha = 0.6) + geom_point() + scale_color_brewer(palette = "Spectral") + scale_fill_brewer(palette = "Spectral") + theme_bw() tab_summarise <- tab_combined %>% group_by(model) %>% summarise(Mean_PredAbi = mean(PredAbi), Max_PredAbi = max(PredAbi), Min_PredAbi = min(PredAbi)) %>% left_join(distinct(select(tab_combined, model, G_Term, Marker_Num))) png(filename = paste0(WorkDir, "Mean_Predictive_Acc_Markers.png"), width = 1000, height = 750, units = "px") ggplot(tab_summarise, aes(x = Marker_Num, y = Mean_PredAbi, color = G_Term)) + geom_line(lwd = 2) + geom_point() + scale_color_brewer(palette = "Paired") + theme_bw() + theme(legend.title = element_text(face = "bold", size = 20, color = "black"), legend.text = element_text(size = 18, color = "black"), axis.text = element_text(size = 18, color = "black"), axis.title = element_text(size = 18, color = "black"), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = "white", fill = "white"), panel.grid = element_line(color = "gray60")) dev.off() png(filename = paste0(WorkDir, "Box_Predictive_Acc_Markers_PPT.png"), width = 1000, height = 750, units = "px") ggplot(tab_combined, aes(x = model, y = PredAbi, fill = G_Term)) + geom_boxplot(alpha = 0.6) + xlab("Model") + ylab("Predictive Ability") + geom_point() + #geom_point(data = tab_summarise, aes(x = model, y = Mean_PredAbi, ), cex = 5, alpha = 1, shape = 9) + scale_color_brewer(palette = "Paired") + scale_fill_brewer(palette = "Paired") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 14, color = "white"), axis.text.y = element_text(size = 14, color = "white"), title = element_text(face = "bold", size = 18, color = "white"), axis.title = element_text(face = "bold", size = 16, color = "white"), plot.background = element_rect(fill = "black"), legend.background = element_rect(fill = "black"), legend.text = element_text(color = "white", size = 14)) + scale_x_discrete(labels = c("A_5k" = "Additive, 5k", "A_10k" = "Additive, 10k", "A_15k" ="Additive, 15k", "A_20k" ="Additive, 20k", "D_5k" ="Dominance, 5k", "D_10k" ="Dominance, 10k", "D_15k" ="Dominance, 15k", "D_20k" ="Dominance, 20k", "A+D_5k" ="A + D, 5k", "A+D_10k" ="A + D, 10k", "A+D_15k" ="A + D, 15k", "A+D_20k" ="A + D, 20k")) dev.off() A_D_comp <- tab_combined[tab_combined$G_Term != "A+D", ] A_D_comp <- A_D_comp %>% select(fold, G_Term, Marker_Num, PredAbi) %>% pivot_wider(names_from = c("G_Term"), values_from = "PredAbi") A_D_comp$diff <- A_D_comp$D - A_D_comp$A summary(A_D_comp$diff) t.test(A_D_comp$A, A_D_comp$D) Pred_Comp <- tab_combined %>% select(fold, G_Term, Marker_Num, PredAbi) %>% pivot_wider(names_from = c("G_Term"), values_from = "PredAbi") t.test(Pred_Comp$A, Pred_Comp$D) t.test(Pred_Comp$D, Pred_Comp$`A+D`) mean(Pred_Comp$D[Pred_Comp$Marker_Num == 10000] - Pred_Comp$D[Pred_Comp$Marker_Num == 5000]) mean(Pred_Comp$D[Pred_Comp$Marker_Num == 15000] - Pred_Comp$D[Pred_Comp$Marker_Num == 10000]) mean(Pred_Comp$D[Pred_Comp$Marker_Num == 20000] - Pred_Comp$D[Pred_Comp$Marker_Num == 15000]) ``` From the above output we can see that the dominance matrix clearly results in better predictions than the additive matrix, and that fitting both A and D doesn't add much to the model's predictive accuracy. When we fit markers moving from 5k to 20k, we get a clear increase in predictive accuracy moving from 5k to 10k markers, but adding beyond the 10k markers doesn't result in great increases in accuracy. From here we will continue onto adding the Env main effects to our model using the Dominance 10k matrix as our marker matrix. #Question 2: What set of environmental covariates provides the best estimation of the environmental main effects? Be sure to use a leave environment out or leave one year out type of cross validation for this method - Models to run through: 1. Env Labels 2. 5 Day Weather Covariates 3. 10 Day Weather Covariates 4. 15 Day Weather Covariates 5. 30 Day Weather Covariates ##Leave One Year Out ```{r LOYO Env Label Model} folder <- "Env_Choice/LOYCV/D_10k_EnvLabels/" #Check diagnostics and output plots #Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB"), folds = 3) #merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_EnvLabels <- Load_Output(folder) D_EnvLabels$Predictions$Fold <- factor(D_EnvLabels$Predictions$Fold + 2013, levels = c(2014:2016)) ggplot(D_EnvLabels$W_Env, aes(x = Best10, y = PredAbi, color = as.character(fold + 2013))) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r LOYO Five Day Weather} folder <- "Env_Choice/LOYCV/D_10k_5Day/" #Check that the run was complete #Check diagnostics and output plots #Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB", "ETA_Env_lambda"), folds = 3) #merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_10k_5Day <- Load_Output(folder) D_10k_5Day$Sum <- D_10k_5Day$W_Env %>% group_by(fold) %>% summarise(mean_pred_abi = mean(PredAbi)) D_10k_5Day$Predictions$Fold <- factor(D_10k_5Day$Predictions$Fold, levels = c(1:3)) #ggplot(D_10k_5Day$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() #ggplot(D_10k_5Day$W_Env, aes(x = Env, y = PredAbi)) + geom_boxplot() + geom_point() #ggplot(D_10k_5Day$W_Env, aes(x = fold, y = PredAbi)) + geom_point() #Should see no correlation between fold and predictive accuracy since folds are random ``` ```{r LOYO Ten Day Model} folder <- "Env_Choice/LOYCV/D_10k_10Day/" #Check diagnostics and output plots #Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB", "ETA_Env_lambda"), folds = 3) #merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_10k_10Day <- Load_Output(folder) D_10k_10Day$Predictions$Fold <- factor(D_10k_10Day$Predictions$Fold, levels = c(1:3)) #ggplot(D_10k_10Day$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() #ggplot(D_10k_10Day_W_Env, aes(x = Env, y = PredAbi)) + geom_boxplot() + geom_point() #ggplot(D_10k_10Day_W_Env, aes(x = fold, y = PredAbi)) + geom_point() #Should see no correlation between fold and predictive accuracy since folds are random ``` ```{r LOYO Fifteen Day Weather} folder <- "Env_Choice/LOYCV/D_10k_15Day/" #Check diagnostics and output plots #Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB", "ETA_Env_lambda"), folds = 3) #merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_10k_15Day <- Load_Output(folder) D_10k_15Day$Predictions$Fold <- factor(D_10k_15Day$Predictions$Fold, levels = c(1:3)) #ggplot(D_10k_15Day$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() #ggplot(D_10k_15Day_W_Env, aes(x = Env, y = PredAbi)) + geom_boxplot() + geom_point() #ggplot(D_10k_15Day_W_Env, aes(x = fold, y = PredAbi)) + geom_point() #Should see no correlation between fold and predictive accuracy since folds are random ``` ```{r LOYO Thirty Day Model} folder <- "Env_Choice/LOYCV/D_10k_30Day/" #Check diagnostics and output plots #Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB", "ETA_Env_lambda"), folds = 3) #merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_10k_30Day <- Load_Output(folder) D_10k_30Day$Predictions$Fold <- factor(D_10k_30Day$Predictions$Fold, levels = c(1:3)) #ggplot(D_10k_30Day$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() #ggplot(D_10k_30Day_W_Env, aes(x = Env, y = PredAbi)) + geom_boxplot() + geom_point() #ggplot(D_10k_30Day_W_Env, aes(x = fold, y = PredAbi)) + geom_point() #Should see no correlation between fold and predictive accuracy since folds are random ``` ```{r LOYO Rough Tabulated Analysis} Env_tab_res <- bind_rows(D_EnvLabels$tab_res, D_10k_5Day$tab_res, D_10k_10Day$tab_res, D_10k_15Day$tab_res, D_10k_30Day$tab_res) #, DW_5k$tab_res, DW_10k$tab_res, DW_15k$tab_res, DW_20k$tab_res) Env_tab_res$model <- str_remove(str_remove(Env_tab_res$model, pattern = "_fold_[0-9]{1,2}"), pattern = "CV1_") Env_tab_res$model <- factor(Env_tab_res$model, levels = c("D_10k_EnvLabel", "D_10k_5Day", "D_10k_10Day", "D_10k_15Day", "D_10k_30Day"))#, "D_5k_W", "D_10k_W", "D_15k_W", "D_20k_W")) Env_tab_res$Env_Term <- factor(str_remove(Env_tab_res$model, "D_10k_"), levels = c("EnvLabel", "5Day", "10Day", "15Day", "30Day")) Env_tab_res$Window <- as.numeric(str_remove(Env_tab_res$Env_Term, "Day")) Env_wide <- Env_tab_res %>% select(Env_Term, fold, PredAbi) %>% pivot_wider(id_cols = c("fold"), values_from = "PredAbi", names_from = "Env_Term") kable(Env_wide) RankCor_wide <- Env_tab_res %>% select(Env_Term, fold, RankCor) %>% pivot_wider(id_cols = c("fold"), values_from = "RankCor", names_from = "Env_Term") Env_Sum <- bind_rows(D_10k_5Day$W_Env, D_10k_10Day$W_Env, D_10k_15Day$W_Env, D_10k_30Day$W_Env) %>% group_by(fold, model) %>% summarise(mean_pred_abi = mean(PredAbi)) ggplot(data = Env_tab_res, aes(x = Env_Term, y = PredAbi, color = "#3288BD", fill = "#3288BD")) + geom_boxplot(alpha = 0.5) + geom_point() + scale_color_manual(values = "#3288BD") + scale_fill_manual(values = "#3288BD") + theme_bw() + guides(color = FALSE, fill = FALSE) png(filename = paste0(OutDir, "Env_Term_Choice_Line.png"), width = 1250, height = round(1250 * 0.666), unit = "px") ggplot(data = Env_tab_res, aes(x = Env_Term, y = PredAbi, group = fold)) + geom_line(aes(color = as.character(fold + 2013))) + #geom_boxplot(alpha = 0.5, aes(fill = "#3288BD")) + geom_point(aes(color = as.character(fold + 2013)), cex = 6) + scale_color_manual(values = c("#D53E4F", "#F46D43", "#FDAE61")) + scale_fill_manual(values = "#3288BD") + labs(color = "Year", x = "Environment Term", y = "Predictive Ability") + scale_x_discrete(labels = c("EnvLabel" = "Environment \n Labels", "5Day" = "5 Day", "10Day" = "10 Day", "15Day" = "15 Day", "30Day" = "30 Day")) + theme_bw() + guides(fill = FALSE) + theme(legend.title = element_text(face = "bold", size = 20, color = "black"), legend.text = element_text(size = 18, color = "black"), axis.text = element_text(size = 18, color = "black"), axis.title = element_text(size = 18, color = "black"), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = "white", fill = "white"), panel.grid = element_line(color = "gray60")) dev.off() ``` ```{r LOYO Within Env Tabulation} W_Env_tab_res <- bind_rows(D_EnvLabels$W_Env, D_10k_5Day$W_Env, D_10k_10Day$W_Env, D_10k_15Day$W_Env, D_10k_30Day$W_Env) #, DW_5k_tab_res, DW_10k_tab_res, DW_15k_tab_res, DW_20k_tab_res) W_Env_tab_res$model <- str_remove(str_remove(W_Env_tab_res$model, pattern = "_fold_[0-9]{1,2}"), pattern = "CV1_") W_Env_tab_res$model <- factor(W_Env_tab_res$model, levels = c("D_10k_EnvLabel", "D_10k_5Day", "D_10k_10Day", "D_10k_15Day", "D_10k_30Day"))#, "D_5k_W", "D_10k_W", "D_15k_W", "D_20k_W")) W_Env_tab_res$Env_Term <- factor(str_remove(W_Env_tab_res$model, "D_10k_"), levels = c("EnvLabel", "5Day", "10Day", "15Day", "30Day")) W_Env_tab_res$Year <- factor(W_Env_tab_res$fold + 2013, levels = c("2014", "2015", "2016")) W_Env_tab_res %>% group_by(Env_Term, Year) %>% summarise(mean_pred_abi = round(mean(PredAbi), 3)) -> Group_Sum xtabs(mean_pred_abi ~Env_Term + Year, Group_Sum) ggplot(data = W_Env_tab_res, aes(x = Env_Term, y = PredAbi, color = Year, fill = Year)) + geom_boxplot(alpha = 0.5) + geom_point(position = position_jitterdodge()) + scale_color_brewer(palette = "Paired") + scale_fill_brewer(palette = "Paired") + theme_bw() + guides(fill = FALSE) test <- W_Env_tab_res %>% group_by(Year, Env_Term) %>% summarise(Mean_PredAbi = mean(PredAbi), Mean_RankCor = mean(RankCor)) ``` Leaving one year out the results seem to say that we should use the 5 Day weather data. When you look by environment the plots look quite similar. this might be because we have such a large pool of data - ##CV 65 (Leave one Environment Out) ```{r CV65 Env Label Model} folder <- "CV65/D_10k_EnvLabels/" #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB"), folds = 3) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_EnvLabels <- Load_Output(folder) D_EnvLabels$Predictions$Fold <- factor(D_EnvLabels$Predictions$Fold, levels = c(1:59)) ggplot(D_EnvLabels$W_Env, aes(x = Best10, y = PredAbi)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ``` ```{r CV65 Five Day Weather} folder <- "CV65/D_10k_5Day/" #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB", "ETA_Env_lambda"), folds = 3) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_10k_5Day <- Load_Output(folder) D_10k_5Day$Predictions$Fold <- factor(D_10k_5Day$Predictions$Fold, levels = c(1:59)) #ggplot(D_10k_5Day$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ggplot(D_10k_5Day$W_Env, aes(x = Env, y = PredAbi)) + geom_boxplot() + geom_point() ggplot(D_10k_5Day$W_Env, aes(x = fold, y = PredAbi)) + geom_point() #Should see no correlation between fold and predictive accuracy since folds are random ``` ```{r CV65 Ten Day Model} folder <- "CV65/D_10k_10Day/" #Check that the run was complete D_10k_10Day_out <- Confirm_Complete(folder) table(D_10k_10Day_out$out_complete) table(D_10k_10Day_out$errs_present) #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB", "ETA_Env_lambda"), folds = 3) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_10k_10Day_tab_res <- Read_Tab_Results(folder) D_10k_10Day_W_Env <- Read_Within_Env_Results(folder) D_10k_10Day$Predictions <- Read$Predictions(folder) D_10k_10Day$Predictions$Fold <- factor(D_10k_10Day$Predictions$Fold, levels = c(1:59)) #ggplot(D_10k_10Day$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ggplot(D_10k_10Day_W_Env, aes(x = Env, y = PredAbi)) + geom_boxplot() + geom_point() ggplot(D_10k_10Day_W_Env, aes(x = fold, y = PredAbi)) + geom_point() #Should see no correlation between fold and predictive accuracy since folds are random ``` ```{r CV65 Fifteen Day Weather} folder <- "CV65/D_10k_15Day/" #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB", "ETA_Env_lambda"), folds = 3) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_10k_15Day <- Load_Output(folder) D_10k_15Day$Predictions$Fold <- factor(D_10k_15Day$Predictions$Fold, levels = c(1:59)) #ggplot(D_10k_15Day$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ggplot(D_10k_15Day_W_Env, aes(x = Env, y = PredAbi)) + geom_boxplot() + geom_point() ggplot(D_10k_15Day_W_Env, aes(x = fold, y = PredAbi)) + geom_point() #Should see no correlation between fold and predictive accuracy since folds are random ``` ```{r CV65 Thirty Day Model} folder <- "CV65/D_10k_30Day/" #Check diagnostics and output plots Bayes_Diagnostic_Check(folder, params = c("mu", "varE", "ETA_Geno_varB", "ETA_Env_lambda"), folds = 3) merge_pngs(png_folder = folder, WorkDir = WorkDir) #Load in result data D_10k_30Day <- Load_Output(folder) D_10k_30Day$Predictions$Fold <- factor(D_10k_30Day$Predictions$Fold, levels = c(1:59)) #ggplot(D_10k_30Day$Predictions, aes(x = Phenotype, y = GEBV, color = Fold)) + geom_point() + scale_color_brewer(palette = "Spectral") + theme_bw() ggplot(D_10k_30Day_W_Env, aes(x = Env, y = PredAbi)) + geom_boxplot() + geom_point() ggplot(D_10k_30Day_W_Env, aes(x = fold, y = PredAbi)) + geom_point() #Should see no correlation between fold and predictive accuracy since folds are random ``` ```{r CV65 Rough Tabulated Analysis} CV65_Env_tab_res <- bind_rows(D_EnvLabels$tab_res, D_10k_5Day$tab_res, D_10k_10Day$tab_res, D_10k_15Day$tab_res, D_10k_30Day$tab_res) #, DW_5k$tab_res, DW_10k$tab_res, DW_15k$tab_res, DW_20k$tab_res) CV65_Env_tab_res$model <- str_remove(str_remove(CV65_Env_tab_res$model, pattern = "_fold_[0-9]{1,2}"), pattern = "CV65_") CV65_Env_tab_res$model <- factor(CV65_Env_tab_res$model, levels = c("D_10k_EnvLabel", "D_10k_5Day", "D_10k_10Day", "D_10k_15Day", "D_10k_30Day"))#, "D_5k_W", "D_10k_W", "D_15k_W", "D_20k_W")) CV65_Env_tab_res$Env_Term <- factor(str_remove(CV65_Env_tab_res$model, "D_10k_"), levels = c("EnvLabel", "5Day", "10Day", "15Day", "30Day")) CV65_Env_tab_res$Window <- as.numeric(str_remove(CV65_Env_tab_res$Env_Term, "Day")) CV65_Env_wide <- CV65_Env_tab_res %>% select(Env_Term, fold, PredAbi) %>% pivot_wider(id_cols = c("fold"), values_from = "PredAbi", names_from = "Env_Term") kable(Env_wide) RankCor_wide <- CV65_Env_tab_res %>% select(Env_Term, fold, RankCor) %>% pivot_wider(id_cols = c("fold"), values_from = "RankCor", names_from = "Env_Term") ggplot(data = CV65_Env_tab_res, aes(x = Env_Term, y = PredAbi, color = "#3288BD", fill = "#3288BD")) + geom_boxplot(alpha = 0.5) + geom_point() + scale_color_manual(values = "#3288BD") + scale_fill_manual(values = "#3288BD") + theme_bw() + guides(color = FALSE, fill = FALSE) ggplot(data = CV65_Env_tab_res, aes(x = Env_Term, y = PredAbi, fill = "#3288BD")) + geom_boxplot(alpha = 0.5) + geom_point(aes(color = as.character(fold)), cex = 4) + scale_color_brewer(palette = "Spectral") + scale_fill_manual(values = "#3288BD") + theme_bw() + guides(fill = FALSE) ``` ```{r CV65 Within Env Tabulation} W_Env_tab_res <- bind_rows(D_EnvLabels$W_Env, D_10k_5Day$W_Env, D_10k_10Day$W_Env, D_10k_15Day$W_Env, D_10k_30Day$W_Env) #, DW_5k_tab_res, DW_10k_tab_res, DW_15k_tab_res, DW_20k_tab_res) W_Env_tab_res$model <- str_remove(str_remove(W_Env_tab_res$model, pattern = "_fold_[0-9]{1,2}"), pattern = "CV1_") W_Env_tab_res$model <- factor(W_Env_tab_res$model, levels = c("D_10k_EnvLabel", "D_10k_5Day", "D_10k_10Day", "D_10k_15Day", "D_10k_30Day"))#, "D_5k_W", "D_10k_W", "D_15k_W", "D_20k_W")) W_Env_tab_res$Env_Term <- factor(str_remove(W_Env_tab_res$model, "D_10k_"), levels = c("EnvLabel", "5Day", "10Day", "15Day", "30Day")) W_Env_tab_res$Year <- factor(W_Env_tab_res$fold + 2013, levels = c("2014", "2015", "2016")) ggplot(data = W_Env_tab_res, aes(x = Env_Term, y = PredAbi, color = Year, fill = Year)) + geom_boxplot(alpha = 0.5) + geom_point(position = position_jitterdodge()) + scale_color_brewer(palette = "Paired") + scale_fill_brewer(palette = "Paired") + theme_bw() + guides(fill = FALSE) test <- W_Env_tab_res %>% group_by(Year, Env_Term) %>% summarise(Mean_PredAbi = mean(PredAbi), Mean_RankCor = mean(RankCor)) ``` Examine which covariates are used for the Env portion of each of the folds -which are consistent/have high effect size? ```{r LOYO Examine betas for env covs} load(paste0(WorkDir, "LOYCV/D_10k_5Day/CV1_D_10k_5Day_fold_1_Model.RData")) Env_betas_1 <- Test_Mod$ETA$Env$b SD_Env_betas_1 <- Test_Mod$ETA$Env$SD.b load(paste0(WorkDir, "LOYCV/D_10k_5Day/CV1_D_10k_5Day_fold_2_Model.RData")) Env_betas_2 <- Test_Mod$ETA$Env$b SD_Env_betas_2 <- Test_Mod$ETA$Env$SD.b load(paste0(WorkDir, "LOYCV/D_10k_5Day/CV1_D_10k_5Day_fold_3_Model.RData")) Env_betas_3 <- Test_Mod$ETA$Env$b SD_Env_betas_3 <- Test_Mod$ETA$Env$SD.b Env_betas_comb <- as.data.frame(rbind(Env_betas_1, Env_betas_2, Env_betas_3)) SD_Env_betas_comb <- as.data.frame(rbind(SD_Env_betas_1, SD_Env_betas_2, SD_Env_betas_3)) Env_betas_long <- Env_betas_comb %>% pivot_longer(names_to = "Env_Cov", cols = 1:377, values_to = "Beta") Env_betas_long$Fold <- rep(c(1:3), each = nrow(Env_betas_long)/3) Summary_Beta <- Env_betas_long %>% group_by(Env_Cov) %>% summarise(mean = mean(Beta), sd = sd(Beta), var = var(Beta)) ``` Having Chosen the 5 Day Env Covariates we now move to GxE modeling. Two options here, Dimension reduction via PCA or Feature Selection for Env Cov's to use for GxE. ### GxE Modeling #### Baseline Method ```{r Baseline Data} PC_E_folders <- paste0("Baseline/Base_GxE_PC_Env/BASELINE_", seq(10, 60, by = 10), "/") No_GxE_Folders <- paste0("Baseline/Base_No_GxE/No_GxE_", seq(10, 60, by = 10), "/") PC_G_Folders <- paste0("Baseline/Base_GxE_PC_Gen/Base_", seq(10, 60, by = 10), "/") folder <- c(PC_E_folders, No_GxE_Folders, PC_G_Folders) Baseline_No_GxE <- Read_Folders(No_GxE_Folders) Baseline_No_GxE <- Model_Type(output_list = Baseline_No_GxE, given_method = "G + E") Baseline_PC_Env <- Read_Folders(PC_E_folders) Baseline_PC_Env <- Model_Type(output_list = Baseline_PC_Env, given_method = "PC(Env)") Baseline_PC_Gen <- Read_Folders(PC_G_Folders) Baseline_PC_Gen <- Model_Type(output_list = Baseline_PC_Gen, given_method = "PC(Markers)") base_tab <- Baseline_No_GxE$tab_res %>% bind_rows(Baseline_PC_Env$tab_res) %>% bind_rows(Baseline_PC_Gen$tab_res) base_out <- Baseline_No_GxE$out %>% bind_rows(Baseline_PC_Env$out) %>% bind_rows(Baseline_PC_Gen$out) base_within_env <- Baseline_No_GxE$W_Env %>% bind_rows(Baseline_PC_Env$W_Env) %>% bind_rows(Baseline_PC_Gen$W_Env) base_predictions <- Baseline_No_GxE$Predictions %>% bind_rows(Baseline_PC_Env$Predictions) %>% bind_rows(Baseline_PC_Gen$Predictions) #base_predictions <- base_predictions %>% head(20) %>% separate(model, into = c('cvm_dup', 'perc_missing', 'D', 'n_mrk', 'E', 'fold_text', 'fold'), sep = '_', remove = FALSE) %>% select() base_tab$Model <- base_tab$Type base_tab$Type <- "GxE" base_tab$Type[base_tab$Model == "G + E"] <- "No_GxE" #base_within_env <- base_out%>% select(fold, n_trn, n_test, folder) %>% right_join(base_within_env) GxE_Comp <- base_tab %>% filter(!str_detect(Type, "No_GxE")) %>% pivot_wider(id_cols = c("n_trn", "n_test", "fold", "Model"), names_from = Type, values_from = PredAbi) %>% mutate(Perc_Missing = round(n_test/16106 * 100, digits = -1)) No_GxE_Pred <- base_tab%>% filter(str_detect(Type, "No_GxE")) %>% pivot_wider(id_cols = c("n_trn", "n_test", "fold", "Model"), names_from = Type, values_from = PredAbi) %>% select(-Model) %>% mutate(Perc_Missing = round(n_test/16106 * 100, digits = -1)) GxE_Comp <- GxE_Comp %>% left_join(y = No_GxE_Pred, by = c("fold", "Perc_Missing")) %>% mutate(Diff = GxE - No_GxE) ``` ```{r Function to compute AnSci version of Bias} Compute_Slopes <- function(Pred_Obj, model_info, prefix){ Running_Bias <- data.frame() #Place to put overall bias #Compute overall bias for fold for (i in model_info){ cat(paste0('\nWorking on model ', match(i, model_info), ' of ', length(model_info), '.')) df <- Pred_Obj %>% filter(model == i) mod <- lm(Phenotype ~ GEBV, data = df) Fake_Bias <- mod$coefficients[2] new_row <- c(i, df$Fold[df$model == i][1], df$CV_Method[df$model == i][1], df$Type[df$model == i][1], Fake_Bias) Running_Bias <- as.data.frame(matrix(new_row, ncol = 5)) colnames(Running_Bias) <- c("Model", "Fold", "CV_Method", "Type", "Slope") Running_Bias <- Running_Bias %>% mutate(Slope = as.numeric(Slope)) write.table(Running_Bias, file = paste0(OutDir, prefix, "_Running_Slopes.csv"), sep = ',', append = TRUE, row.names = FALSE, col.names = !file.exists(paste0(OutDir, prefix, '_Running_Slopes.csv'))) #Compute within env bias for the fold unique_envs <- unique(df$Env) w_env_bias <- data.frame() #Place to put within env bias results for(j in 1:length(unique_envs)){ env_sub <- df %>% filter(Env == unique_envs[j]) if(nrow(env_sub) >= 8){ env_mod <- lm(Phenotype ~ GEBV, data = env_sub) Slope <- env_mod$coefficients[2] env_new_row <- c(j, env_sub$Env[1], env_sub$Fold[1], env_sub$model[1], env_sub$CV_Method[1], env_sub$Type[1], Slope) w_env_bias <- rbind(w_env_bias, env_new_row) } else { next } } names(w_env_bias) <- c("Env_Num", "Env", "Fold", "Model", "CV_Method", "Type", "Slope") w_env_bias <- w_env_bias %>% mutate(Slope = as.numeric(Slope)) write.table(w_env_bias, file = paste0(OutDir, prefix, '_w_env_slopes.csv'), sep = ',', append = TRUE, row.names = FALSE, col.names = !file.exists(paste0(OutDir, prefix, '_w_env_slopes.csv'))) } #Bias_Obj <- list(Overall_Bias = Running_Bias, W_Env_Bias = w_env_bias) #Bias_Obj$W_Env_Summary <- Bias_Obj$W_Env_Bias %>% # group_by(Env) %>% # summarise(min_slope = min(Slope), mean_slope = mean(Slope, na.rm = TRUE), med_slope = median(Slope), max_slope = max(Slope), var_slope = var(Slope)) return(paste0('Slope computation complete. Output is in: ', OutDir, prefix, " .csv files")) } Compute_Slopes(Pred_Obj = base_predictions, model_info = unique(base_predictions$model), prefix = 'BASELINE') ``` ```{r Output Baseline Plot} png(filename = paste0(OutDir, "Train_G_Vs_GxE.png"), width = 1500, height = 1000, units = "px") ggplot(base_tab, aes(x = n_trn, y = PredAbi, group = Model, color = Model)) + geom_point(aes(size = 1), alpha = 0.85) + ylim(0.7, 0.9) + xlim(6000, 16000) + scale_color_manual(values = c("#66C2A5", "#3288BD", "#5E4FA2"), labels = c("No GxE", "PCA(Env) * Markers", "PCA(Markers) * Env")) + theme_bw() + geom_smooth(se = TRUE, method = "lm", aes(size = 0.9)) + theme(legend.title = element_text(face = "bold", size = 20, color = "black"), legend.text = element_text(size = 18, color = "black"), axis.text = element_text(size = 18, color = "black"), axis.title = element_text(size = 18, color = "black"), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = "white", fill = "white"), panel.grid = element_line(color = "gray60")) + ylab("Predictive Ability") + xlab("Training Set Size") + guides(size = FALSE) dev.off( base_within_env$Model <- base_within_env$Type #base_within_env$Type <- "GxE" #base_within_env$Type[str_detect(base_within_env$folder, pattern = "No_GxE")] <- "No_GxE" base_within_env$Type[str_detect(base_within_env$Model, pattern = "PC")] <- "GxE" base_within_env$Type[!str_detect(base_within_env$Model, pattern = "PC")] <- "No_GxE" WEnv_GxE_comp <- base_within_env %>% left_join(select(base_tab, model, n_trn, n_test)) %>% filter(Type == "GxE") %>% mutate(folder = str_extract(model, pattern = "[1-6]0")) %>% pivot_wider(id_cols = c("Env", "n_trn", "n_test", "fold", "folder", "Model"), names_from = Type, values_from = PredAbi) WEnv_No <- base_within_env %>% left_join(select(base_tab, model, n_trn, n_test)) %>% filter(Type != "GxE") %>% mutate(folder = str_extract(model, pattern = "[1-6]0")) %>% pivot_wider(id_cols = c("Env", "n_trn", "n_test", "fold", "folder", "Model"), names_from = Type, values_from = PredAbi) WEnv_GxE_comp <- WEnv_GxE_comp %>% left_join(y = WEnv_No, by = c("Env", "fold", "n_trn", "n_test", "folder")) %>% mutate(Diff = GxE - No_GxE) ``` ```{r} MeanW_EnvComp <- WEnv_GxE_comp %>% filter(str_detect(`Model.x`, "PC\\(M")) %>% group_by(Env) %>% summarise(mean_diff = mean(Diff, na.rm = TRUE)) %>% left_join(select(Stage1_ModelingOut, "Env", "H.mean", "Verr")) WEnv_GxE_comp$folder <- 100 - WEnv_GxE_comp$folder png(filename = paste0(OutDir, "Train_Box_GvsGxE.png"), width = 2000, height = 1500, units = "px") #Boxplot by train size within env ggplot(WEnv_GxE_comp[WEnv_GxE_comp$Model.x == "PC(Markers)", ], aes(x = Env, y = Diff, fill = as.character(100 - as.numeric(folder)))) + #facet_wrap(~ Env) + geom_boxplot(alpha = 0.6) + theme_bw() + geom_hline(yintercept = 0, color = "red") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + scale_fill_brewer(name = "Percent data in train", palette = "Paired") + theme(legend.title = element_text(face = "bold", size = 20, color = "black"), legend.text = element_text(size = 18, color = "black"), axis.text = element_text(size = 18, color = "black"), axis.title = element_text(size = 18, color = "black"), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = "white", fill = "white"), panel.grid = element_line(color = "gray60")) + xlab("Environment") + ylab("GxE Predictive Ability - G + E Predictive Ability") #+ coord_flip() dev.off() Envs <- unique(base_within_env$Env) for(i in 1:length(Envs)){ env_sub <- base_within_env[base_within_env$Env == Envs[i], ] #png(filename = paste0(OutDir, "Baseline_Within_Env_Curves/", Envs[i], ".png"), height = 750, width = 1000, units = "px") w_env_plot <- ggplot(env_sub, aes(x = n_trn, y = PredAbi)) + geom_point() + ggtitle(label = Envs[i]) + geom_smooth() + theme_bw() + ylim(0, 1) + xlim(5000, 16000) #dev.off() ggsave(filename = paste0(OutDir, "Baseline_Within_Env_Curves/", Envs[i], ".png"), plot = w_env_plot, height = 4, width = 6, units = "in") print(paste0("Env ", i, " Complete.")) } merge_pngs(png_folder = "Baseline_Within_Env_Curves/", WorkDir = OutDir, OutDir = OutDir) w_env_base <- base_within_env %>% group_by(folder) %>% summarise(mean_trn = mean(n_trn), mean_pred_abi = mean(PredAbi), sd_pred_abi = sd(PredAbi), count = n()) %>% mutate(se_pred_abi = sd_pred_abi/count) ``` ```{r Read in All Models Data, echo=FALSE} #folder_list <- list.dirs(paste0(WorkDir, "Full_GxE_PC_G/")) #GxE_DimRed/ #folder_list <- str_replace(folder_list, "//", "/") #folder_list <- str_remove(folder_list, WorkDir) #folder_list_a <- folder_list[!str_detect(folder_list, pattern = "Baseline")] #folder_list_a <- folder_list_a[-1] #Remove top level folder #folder_list_a <- paste0(folder_list_a, "/") #GxE_Base <- base_comb[base_comb$Type == "GxE" & str_detect(base_comb$Model, pattern = "PC\\(M"), ] Env_CV_folder_list <- Create_Folder_List(WorkDir = WorkDir, "Env_CV/") Env_CV <- Read_Folders(folder_list = Env_CV_folder_list) Env_CV <- Model_Type(Env_CV, given_method = "G + E") #Try it again for the GxE PCA(Env) results PC_E_folder_list <- Create_Folder_List(WorkDir = WorkDir, "GxE_Pc_Env/") PC_E_CV <- Read_Folders(folder_list = PC_E_folder_list) PC_E_CV <- Model_Type(PC_E_CV, given_method = "PC(Env)") Pc_M_folder_list <- Create_Folder_List(WorkDir = WorkDir, "GxE_PC_Gen/") PC_G_CV <- Read_Folders(folder_list = Pc_M_folder_list) PC_G_CV <- Model_Type(PC_G_CV, given_method = "PC(Markers)") tab_combined <- Env_CV$tab_res %>% bind_rows(PC_E_CV$tab_res) %>% bind_rows(PC_G_CV$tab_res) w_env_combined <- Env_CV$W_Env %>% bind_rows(PC_E_CV$W_Env) %>% bind_rows(PC_G_CV$W_Env) %>% left_join(y = Stage1_ModelingOut) Pred_combined <- Env_CV$Predictions %>% bind_rows(PC_E_CV$Predictions) %>% bind_rows(PC_G_CV$Predictions) load(paste0(WorkDir, "R_data_objects/PhenoG2F.Rdata")) join_info <- PhenoG2F %>% select(Env, Hybrid, RandHybFold, YearCluster, EnvClusters, WardClusters, CV2_Fold_Num, CV3_Fold, CV4_Fold, CV5_Fold, CV6_Fold, CV65) %>% rename(CV1_Fold = RandHybFold, LOYO_Fold = YearCluster, LORE_Fold = EnvClusters, LORH_Fold = WardClusters, CV2_Fold = CV2_Fold_Num, CV65_Fold = CV65) %>% pivot_longer(cols = ends_with("Fold"), names_to = "CV_Method", values_to = "Fold") %>% mutate(CV_Method = str_remove(CV_Method, "_Fold")) combined_bias <- Pred_combined %>% right_join(y = join_info) %>% group_by(CV_Method, Env, Type) %>% summarise(Mean_GEBV = mean(GEBV, na.rm = TRUE), Mean_Phenotype = mean(Phenotype, na.rm = TRUE)) %>% mutate(Bias = Mean_GEBV - Mean_Phenotype) Per_fold_combined_bias <- Pred_combined %>% right_join(y = join_info) %>% group_by(CV_Method, Env, Type, Fold) %>% summarise(Mean_GEBV = mean(GEBV, na.rm = TRUE), Mean_Phenotype = mean(Phenotype, na.rm = TRUE)) %>% mutate(Bias = Mean_GEBV - Mean_Phenotype) %>% rename(fold = Fold) fold_counts <- Pred_combined %>% right_join(y = join_info) %>% group_by(CV_Method, Env, Fold, Type) %>% summarise(n = n()) %>% rename(fold = Fold) w_env_combined <- w_env_combined %>% left_join(y = Per_fold_combined_bias) %>% left_join(fold_counts) w_env_combined_nofilter <- w_env_combined w_env_combined <- w_env_combined %>% filter(n >= 8) ``` Statistics for bidirectional schemes need to be recomputed because initial calculations were done on all held out observations, rather than just the observations with no relationship to the training data ```{r Recompute corrs for Bidirectional schemes} Fold_Info_Join <- PhenoG2F %>% select(Env, Hybrid, CV3_Fold, CV4_Fold, CV5_Fold, CV6_Fold) #%>% pivot_longer(cols = ends_with("Fold"), names_to = "CV_Method", values_to = "Fold") %>% mutate(CV_Method = str_remove(CV_Method, "_Fold")) #test <- Pred_combined %>% right_join(y = Fold_Info_Join) BiDirectional_Recompute <- Pred_combined %>% filter(CV_Method %in% c("CV3", "CV4", "CV5", "CV6")) %>% left_join(Fold_Info_Join) #Loop through to compute accuracies Across_Env_Frame <- {} Within_Env_Frame <- {} for(Model in c("G + E", "PC(Env)", "PC(Markers)")){ Type_Sub <- BiDirectional_Recompute %>% filter(Type == Model) #Compute Across Env Accuracies with just the intersection set CV3_Tab <- Type_Sub %>% filter(CV3_Fold == Fold, CV_Method == "CV3") %>% group_by(Fold) %>% summarise(n = n(), PredAbi = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "pearson"), 3), RankCor = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "spearman"), 3), MSE = round(mean((Phenotype - GEBV)^2, na.rm = TRUE), 3), Best10 = round(mean(tail(sort(Phenotype, decreasing = TRUE), n = ceiling(n*0.1)), na.rm = TRUE), 3), CV_Method = "CV3", Type = Model) %>% rename(n_test = n) CV4_Tab <- Type_Sub %>% filter(CV4_Fold == Fold, CV_Method == "CV4") %>% group_by(Fold) %>% summarise(n = n(), PredAbi = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "pearson"), 3), RankCor = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "spearman"), 3), MSE = round(mean((Phenotype - GEBV)^2, na.rm = TRUE), 3), Best10 = round(mean(tail(sort(Phenotype, decreasing = TRUE), n = ceiling(n*0.1)), na.rm = TRUE), 3), CV_Method = "CV4", Type = Model) %>% rename(n_test = n) CV5_Tab <- Type_Sub %>% filter(CV5_Fold == Fold, CV_Method == "CV5") %>% group_by(Fold) %>% summarise(n = n(), PredAbi = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "pearson"), 3), RankCor = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "spearman"), 3), MSE = round(mean((Phenotype - GEBV)^2, na.rm = TRUE), 3), Best10 = round(mean(tail(sort(Phenotype, decreasing = TRUE), n = ceiling(n*0.1)), na.rm = TRUE), 3), CV_Method = "CV5", Type = Model) %>% rename(n_test = n) CV6_Tab <- Type_Sub %>% filter(CV6_Fold == Fold, CV_Method == "CV6") %>% group_by(Fold) %>% summarise(n = n(), PredAbi = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "pearson"), 3), RankCor = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "spearman"), 3), MSE = round(mean((Phenotype - GEBV)^2, na.rm = TRUE), 3), Best10 = round(mean(tail(sort(Phenotype, decreasing = TRUE), n = ceiling(n*0.1)), na.rm = TRUE), 3), CV_Method = "CV6", Type = Model) %>% rename(n_test = n) Across_Env_Combined <- bind_rows(CV3_Tab, CV4_Tab, CV5_Tab, CV6_Tab) Across_Env_Frame <- rbind(Across_Env_Frame, Across_Env_Combined) #Compute within Env Results CV3_WEnv <- Type_Sub %>% filter(CV3_Fold == Fold, CV_Method == "CV3") %>% group_by(Fold, Env) %>% summarise(n = n(), PredAbi = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "pearson"), 3), RankCor = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "spearman"), 3), MSE = round(mean((Phenotype - GEBV)^2, na.rm = TRUE), 3), Best10 = round(mean(tail(sort(Phenotype, decreasing = TRUE), n = ceiling(n*0.1)), na.rm = TRUE), 3), Bias = (mean(GEBV, na.rm = TRUE) - mean(Phenotype, na.rm = TRUE)), CV_Method = "CV3", Type = Model) %>% rename(n_test = n) CV4_WEnv <- Type_Sub %>% filter(CV4_Fold == Fold, CV_Method == "CV4") %>% group_by(Fold, Env) %>% summarise(n = n(), PredAbi = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "pearson"), 3), RankCor = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "spearman"), 3), MSE = round(mean((Phenotype - GEBV)^2, na.rm = TRUE), 3), Best10 = round(mean(tail(sort(Phenotype, decreasing = TRUE), n = ceiling(n*0.1)), na.rm = TRUE), 3), Bias = (mean(GEBV, na.rm = TRUE) - mean(Phenotype, na.rm = TRUE)), CV_Method = "CV4", Type = Model) %>% rename(n_test = n) CV5_WEnv <- Type_Sub %>% filter(CV5_Fold == Fold, CV_Method == "CV5") %>% group_by(Fold, Env) %>% summarise(n = n(), PredAbi = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "pearson"), 3), RankCor = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "spearman"), 3), MSE = round(mean((Phenotype - GEBV)^2, na.rm = TRUE), 3), Best10 = round(mean(tail(sort(Phenotype, decreasing = TRUE), n = ceiling(n*0.1)), na.rm = TRUE), 3), Bias = (mean(GEBV, na.rm = TRUE) - mean(Phenotype, na.rm = TRUE)), CV_Method = "CV5", Type = Model) %>% rename(n_test = n) CV6_WEnv <- Type_Sub %>% filter(CV6_Fold == Fold, CV_Method == "CV6") %>% group_by(Fold, Env) %>% summarise(n = n(), PredAbi = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "pearson"), 3), RankCor = round(cor(Phenotype, GEBV, use = "pairwise.complete", method = "spearman"), 3), MSE = round(mean((Phenotype - GEBV)^2, na.rm = TRUE), 3), Best10 = round(mean(tail(sort(Phenotype, decreasing = TRUE), n = ceiling(n*0.1)), na.rm = TRUE), 3), Mean_GEBV = mean(GEBV, na.rm = TRUE), Mean_Phenotype = mean(Phenotype, na.rm = TRUE), Bias = (mean(GEBV, na.rm = TRUE) - mean(Phenotype, na.rm = TRUE)), CV_Method = "CV6", Type = Model) %>% rename(n_test = n) Recompute_WEnv <- bind_rows(CV3_WEnv, CV4_WEnv, CV5_WEnv, CV6_WEnv) Within_Env_Frame <- rbind(Within_Env_Frame, Recompute_WEnv) } #From both recomputes we need to drop any calculations that were made on less than 8 observations. Across_Env_Frame <- Across_Env_Frame %>% filter(n_test >= 8) Within_Env_Frame <- Within_Env_Frame %>% filter(n_test >= 8) #Compute proportion of within env results were dropped - #Then we need to reintegrate these result into our tab and w_env data frames for graphing/further analysis #Get our results combined with the log likelihood and computation times from the tab_combined frame Across_Env_Frame <- Across_Env_Frame %>% rename(fold = Fold) %>% left_join(select(tab_combined, fold, logLikAtPostMean, postMeanLogLik, pD, DIC, varE, User_Time, System_Time, Elapsed_Time, model, n_trn, CV_Method, Type)) %>% select(names(tab_combined)) #Replace the results from tab_combined with the accuracies computed on the correct test sets tab_combined <- tab_combined %>% filter(!(CV_Method %in% c("CV3", "CV4", "CV5", "CV6"))) %>% bind_rows(Across_Env_Frame) Within_Env_Frame <- Within_Env_Frame %>% rename(fold = Fold, n = n_test) %>% left_join(select(w_env_combined, Env, fold, CV_Method, Type, Verr, H.plot, H.mean)) #Filter out all the within env results previously computed and input the corrected results. w_env_combined <- w_env_combined %>% filter(!(CV_Method %in% c("CV3", "CV4", "CV5", "CV6"))) %>% bind_rows(Within_Env_Frame) tab_combined$CV_Method <- factor(tab_combined$CV_Method, levels = c("CV1", "CV2", "LOYO", "LORE", "LORH", "CV65", "CV3", "CV4", "CV6", "CV5")) w_env_combined$CV_Method <- factor(w_env_combined$CV_Method, levels = c("CV1", "CV2", "LOYO", "LORE", "LORH", "CV65", "CV3", "CV4", "CV6", "CV5")) ``` Suset predictions to correctly compute slopes -need to eliminate those from bidirectional schemes that are in teh discard sections ```{r} Uni_Preds <- Pred_combined[!(Pred_combined$CV_Method %in% c('CV3', 'CV4', 'CV5', 'CV6')), ] CV3_Preds <- BiDirectional_Recompute %>% filter(CV3_Fold == Fold) CV4_Preds <- BiDirectional_Recompute %>% filter(CV4_Fold == Fold) CV5_Preds <- BiDirectional_Recompute %>% filter(CV5_Fold == Fold) CV6_Preds <- BiDirectional_Recompute %>% filter(CV6_Fold == Fold) Recomp_Set <- Uni_Preds %>% bind_rows(CV3_Preds) %>% bind_rows(CV4_Preds) %>% bind_rows(CV5_Preds) %>% bind_rows(CV6_Preds) rm(Uni_Preds, CV3_Preds, CV4_Preds, CV5_Preds, CV6_Preds, BiDirectional_Recompute, Pred_combined) Compute_Slopes(Recomp_Set, model_info = unique(Recomp_Set$model), prefix = 'Full_Slopes') W_Env_Slopes <- read.csv(paste0(OutDir, 'Full_Slopes_w_env_slopes.csv')) Full_Slopes <- read.csv(paste0(OutDir, 'Full_Slopes_Running_Slopes.csv')) Env_Mean_Slopes <- W_Env_Slopes %>% group_by(Type, CV_Method, Env) %>% summarise(Mean_Slope = mean(Slope), Var_Slope = var(Slope)) Env_Mean_Slopes$CV_Method <- factor(Env_Mean_Slopes$CV_Method, levels = c("CV1", "CV2", "LOYO", "LORE", "LORH", "CV65", "CV3", "CV4", "CV6", "CV5")) W_Env_Slopes$CV_Method <- factor(W_Env_Slopes$CV_Method, levels = c("CV1", "CV2", "LOYO", "LORE", "LORH", "CV65", "CV3", "CV4", "CV6", "CV5")) Full_Slopes$CV_Method <- factor(Full_Slopes$CV_Method, levels = c("CV1", "CV2", "LOYO", "LORE", "LORH", "CV65", "CV3", "CV4", "CV6", "CV5")) ggplot(W_Env_Slopes, aes(x = Env, y = Slope, Fill = Type)) + geom_boxplot() + geom_hline(yintercept = 1) + facet_grid(CV_Method ~ Type) Type_lab <- c("G + E" = "G + E", "PC(Env)" = "PCA(Env) * Markers", "PC(Markers)" = "PCA(Markers) * Env") CV_lab <- c("CV1" = "CV1", "CV2" = "CV2", "LOYO" = "LO1Y", "LORE" = "LORE", "LORH" = "LORH", "CV65" = "LO1E", "CV3" = "CV1 + \n LO1Y", "CV4" = "LORH + \n LO1Y", "CV6" = "CV1 + \n LORE", "CV5" = "LORH + \n LORE") png(filename = paste0(OutDir, 'W_Env_Slope_Grid.png'), height = 1000, width = 1300, units = "px") ggplot(Env_Mean_Slopes, aes(x = Env, y = 1, fill = Mean_Slope)) + geom_tile() + facet_grid(CV_Method ~ Type, labeller = labeller(Type =Type_lab, CV_Method = CV_lab)) + scale_fill_gradient2(low = "#D53E4F", mid = "white", high = "#3288DB", midpoint = 1) + theme_bw() + xlab("Environment") + theme(text = element_text(size = 18, color = "black"), axis.title = element_text(face = "bold", size = 18, color = "black"), axis.title.y = element_blank(), axis.text.x = element_blank(), legend.title = element_text(size = 18, face = "bold", color = "black"), legend.text = element_text(size = 14, color = "black"), axis.text = element_blank(), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = "white", fill = "white"), panel.grid = element_line(color = "gray60"), plot.margin = margin(0,0,0,0, "pt"), strip.background = element_rect(fill = "white"), strip.text = element_text(face = "bold")) + scale_y_discrete(expand = c(0, 0)) dev.off() ``` ```{r Revision Plot Output} png(filename = paste0(OutDir, 'Within_Env_Slopes.png'), width = 1500, height = 800, units = "px") #This plot, with colors based on other graphs ggplot(W_Env_Slopes, aes(x = CV_Method, y = Slope)) + geom_boxplot(aes(fill = Type), alpha = 0.75) + geom_hline(yintercept = 1, size = 1.2) + ylab('Within Environment Slope') + xlab('Sampling Method') + scale_x_discrete(labels = c("CV1" = "CV1", "CV2" = "CV2", "LOYO" = "LO1Y", "LORE" = "LORE", "LORH" = "LORH", "CV65" = "LO1E", "CV3" = "CV1 + \n LO1Y", "CV4" = "LORH + \n LO1Y", "CV6" = "CV1 + \n LORE", "CV5" = "LORH + \n LORE")) + theme_bw() + scale_fill_manual(values = c("#66C2A5", "#3288BD", "#5E4FA2"), labels = c("No GxE", "PCA(Env) * Markers", "PCA(Markers) * Env"), name = "Model") + theme(legend.title = element_text(face = "bold", size = 20, color = "black"), legend.text = element_text(size = 18, color = "black"), axis.text = element_text(size = 18, color = "black"), axis.title = element_text(face = "bold", size = 18, color = "black"), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = NA, fill = NA), legend.justification = c(0,0) , legend.position = c(0,0), legend.key = element_blank(), panel.grid = element_line(color = "gray60")) dev.off() png(filename = paste0(OutDir, 'Across_Env_Slopes.png'), width = 1500, height = 800, units = "px") #Also this plot. With same colors as other graphs #Across Env slopes ggplot(Full_Slopes, aes(x = CV_Method, y = Slope)) + geom_boxplot(aes(fill = Type), alpha = 0.75) + geom_hline(yintercept = 1) + ylab('Across Environment Slope') + xlab('Sampling Method') + scale_x_discrete(labels = c("CV1" = "CV1", "CV2" = "CV2", "LOYO" = "LO1Y", "LORE" = "LORE", "LORH" = "LORH", "CV65" = "LO1E", "CV3" = "CV1 + \n LO1Y", "CV4" = "LORH + \n LO1Y", "CV6" = "CV1 + \n LORE", "CV5" = "LORH + \n LORE")) + theme_bw() + scale_fill_manual(values = c("#66C2A5", "#3288BD", "#5E4FA2"), labels = c("No GxE", "PCA(Env) * Markers", "PCA(Markers) * Env"), name = "Model") + theme(legend.title = element_text(face = "bold", size = 20, color = "black"), legend.text = element_text(size = 18, color = "black"), axis.text = element_text(size = 18, color = "black"), axis.title = element_text(face = "bold", size = 18, color = "black"), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = NA, fill = NA), legend.justification = c(0,0) , legend.position = c(0,0), legend.key = element_blank(), panel.grid = element_line(color = "gray60")) dev.off() #Env Mean Slopes ggplot(Env_Mean_Slopes, aes(x = CV_Method, y = Mean_Slope)) + geom_boxplot(aes(fill = Type), alpha = 0.75) + geom_hline(yintercept = 1, size = 1.2) + ylab('Within Environment Slope') + xlab('Sampling Method') + scale_x_discrete(labels = c("CV1" = "CV1", "CV2" = "CV2", "LOYO" = "LO1Y", "LORE" = "LORE", "LORH" = "LORH", "CV65" = "LO1E", "CV3" = "CV1 + \n LO1Y", "CV4" = "LORH + \n LO1Y", "CV6" = "CV1 + \n LORE", "CV5" = "LORH + \n LORE")) + theme_bw() + scale_fill_manual(values = c("#66C2A5", "#3288BD", "#5E4FA2"), labels = c("No GxE", "PCA(Env) * Markers", "PCA(Markers) * Env"), name = "Model") + theme(legend.title = element_text(face = "bold", size = 20, color = "black"), legend.text = element_text(size = 18, color = "black"), axis.text = element_text(size = 18, color = "black"), axis.title = element_text(face = "bold", size = 18, color = "black"), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = NA, fill = NA), legend.justification = c(0,0) , legend.position = c(0,0), legend.key = element_blank(), panel.grid = element_line(color = "gray60")) ``` #Corr between slope and prediction ability ```{r Across Env Pred Abi Plot} png(filename = paste0(OutDir, "Across_Env_Accuracy.png"), width = 1500, height = 800, units = "px") ggplot(tab_combined, aes(x = CV_Method, y = PredAbi, fill = Type)) + geom_boxplot(alpha = 0.75) + stat_boxplot(geom="errorbar") + scale_x_discrete(labels = c("CV1" = "CV1", "CV2" = "CV2", "LOYO" = "LO1Y", "LORE" = "LORE", "LORH" = "LORH", "CV65" = "LO1E", "CV3" = "CV1 + \n LO1Y", "CV4" = "LORH + \n LO1Y", "CV6" = "CV1 + \n LORE", "CV5" = "LORH + \n LORE")) + theme_bw() + scale_fill_manual(values = c("#66C2A5", "#3288BD", "#5E4FA2"), labels = c("No GxE", "PCA(Env) * Markers", "PCA(Markers) * Env"), name = "Model") + ylab("Across Environment Prediction Accuracy") + xlab("Sampling Method") + theme(legend.title = element_text(face = "bold", size = 20, color = "black"), legend.text = element_text(size = 18, color = "black"), axis.text = element_text(size = 18, color = "black"), axis.title = element_text(face = "bold", size = 18, color = "black"), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = "white", fill = "white"), panel.grid = element_line(color = "gray60")) dev.off() ``` ```{r LORE PLOT} ggplot(w_env_combined[w_env_combined$CV_Method == "LORE" , ], aes(x = as.character(fold), y = PredAbi, fill = Type)) + geom_boxplot(alpha = 0.75) + theme_bw() + scale_fill_manual(values = c("#66C2A5", "#3288BD", "#5E4FA2"), labels = c("No GxE", "PCA(Env) * Markers", "PCA(Markers) * Env"), name = "Model") + ylab("Within Environment Prediction Accuracy") + xlab("Environment Group") + theme(legend.title = element_text(face = "bold", size = 20, color = "black"), legend.text = element_text(size = 18, color = "black"), axis.text = element_text(size = 18, color = "black"), axis.title = element_text(face = "bold", size = 18, color = "black"), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = "white", fill = "white"), panel.grid = element_line(color = "gray60")) ``` ```{r} ggplot(w_env_combined[w_env_combined$CV_Method == "CV1" & w_env_combined$Type == "G + E", ], aes(x = Env, y = PredAbi, fill = Type)) + geom_boxplot() + theme_bw() #facet_wrap(~ CV_Method, ncol = 1) E_w_Env_combined <- w_env_combined %>% filter(Type == "G + E") %>% pivot_wider(names_from = "Type", id_cols = c("Env", "fold", "CV_Method", "model"), values_from = "PredAbi") GxE_w_Env_combined <- w_env_combined %>% filter(Type != "G + E") %>% mutate(Mod = "GxE") %>% pivot_wider(id_cols = c("Env", "fold", "CV_Method", "model", "Type"), values_from = "PredAbi", names_from = "Mod") W_env_diff <- GxE_w_Env_combined %>% left_join(y = E_w_Env_combined, by = c("Env", "fold", "CV_Method")) %>% mutate(Diff = `GxE` - `G + E`) w_env_diff_summ <- W_env_diff %>% group_by(CV_Method, Type) %>% summarise(mean_diff = mean(Diff, na.rm = TRUE)) %>% pivot_wider(names_from = Type, values_from = mean_diff) ## W_env_method_means <- w_env_combined %>% group_by(Env, CV_Method, Type) %>% summarise(mean_PredAbi = mean(PredAbi, na.rm = T), mean_Bias = mean(Bias, na.rm = T)) %>% left_join(y = distinct(select(w_env_combined, Env, H.mean))) Cor_Per_Method <- W_env_method_means %>% group_by(CV_Method, Type) %>% summarise(Cor_PAcc_H2 = cor(mean_PredAbi/H.mean, H.mean, use = "pairwise.complete"), Cor_PAcc_Bias = cor(mean_PredAbi/H.mean, mean_Bias, use = "pairwise.complete"), Cor_H2_Bias = cor(H.mean, mean_Bias, use = "pairwise.complete")) ggpairs(W_env_method_means[, -1], mapping = aes(fill = CV_Method)) Cor_Table <- Cor_Per_Method %>% pivot_wider(id_cols = CV_Method, values_from = Cor_PAcc_H2, names_from = Type) ``` ```{r Within Env Diff Boxplot} CV_Methods <- w_env_combined %>% select(CV_Method) %>% distinct() %>% arrange(CV_Method) %>% bind_cols(c("CV1", "CV2", "LO1Y", "LORE", "LORH", "LO1E","CV1 + \n LO1Y", "LORH + \n LO1Y", "CV1 + \n LORE", "LORH + \n LORE")) %>% rename("CV_Label" = "...2") ### Make this slightly prettier and loop through for(Method in CV_Methods$CV_Method){ cat(paste0("Working On Plot for ", Method, "."), "\n") Diff_Plot <- ggplot(W_env_diff[W_env_diff$CV_Method == Method, ], aes(x = Env, y = Diff, fill = Type)) + #geom_point() + geom_hline(yintercept = 0, color = "red") + geom_boxplot(alpha = 0.7) + scale_fill_manual(values = c("#3288BD", "#5E4FA2"), labels = c("PCA(Env) * Markers", "PCA(Markers) * Env"), name = "Model") + theme_bw() + theme(legend.title = element_text(face = "bold", size = 20, color = "black"), legend.text = element_text(size = 18, color = "black"), axis.text = element_text(size = 18, color = "black"), axis.title = element_text(face = "bold", size = 18, color = "black"), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = "white", fill = "white"), panel.grid = element_line(color = "gray60"), axis.text.x = element_text(angle = 90, hjust = 1)) + ylab(expression(Delta ~ "GxE - G + E")) + xlab("Env") + ggtitle(paste0(CV_Methods$CV_Label[CV_Methods$CV_Method == Method], " Within Env Diff Plot")) Diff_Plot ggsave(filename = paste0(OutDir, Method, "_DIFF_Boxplot.png"), width = 20, height = 8, units = "in") } ``` ```{r Diff Plots for LOYO and LORE} for(Method in c("LOYO", "LORE")){ cat(paste0("Working On Plot for ", Method, "."), "\n") Diff_Plot <- ggplot(W_env_diff[W_env_diff$CV_Method == Method, ], aes(x = as.character(fold), y = Diff, fill = Type)) + #geom_point() + geom_hline(yintercept = 0, color = "red") + geom_boxplot(alpha = 0.7) + scale_fill_manual(values = c("#3288BD", "#5E4FA2"), labels = c("PCA(Env) * Markers", "PCA(Markers) * Env"), name = "Model") + theme_bw() + theme(legend.title = element_text(face = "bold", size = 20, color = "black"), legend.text = element_text(size = 18, color = "black"), axis.text = element_text(size = 18, color = "black"), axis.title = element_text(face = "bold", size = 18, color = "black"), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = "white", fill = "white"), panel.grid = element_line(color = "gray60"), axis.text.x = element_text(angle = 90, hjust = 1)) + ylab(expression(Delta ~ "GxE - G + E")) + xlab("Fold") + ggtitle(label = paste0(CV_Methods$CV_Label[CV_Methods$CV_Method == Method], " Within Env Accuracy by Fold")) ggsave(filename = paste0(OutDir, Method, "ByFold_DIFF_Boxplot.png"), width = 20, height = 8, units = "in") } ``` ```{r Combined Bias H2 PredAbi Plots} for(Method in CV_Methods){ cat(paste0("Working on Plots for ", Method, "."), "\n") Bias_plot <- ggplot(combined_bias[combined_bias$CV_Method == Method, ], aes(x = Env, y = Type, fill = Bias)) + geom_tile(color = "black") + #guides(fill = FALSE) + ylab("Bias") + xlab("") + scale_fill_gradient2(low = "#D53E4F", mid = "white", high = "#3288DB", midpoint = 0) + #scale_fill_gradientn(colours = c("#FDAE61", "white", "#66C2A5"), # guide = "colourbar", breaks = c(-8,0,8), limits = c(-8,8)) + theme_bw() + theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(), axis.title = element_text(face = "bold", size = 18, color = "black"), axis.text = element_text(size = 18, color = "black"), axis.ticks.length.x = unit(0, "pt"), plot.margin = margin(0,0,0,0, "pt")) + scale_y_discrete(expand = c(0,0)) + scale_x_discrete(expand = c(0,0)) Heritability_Plot <- ggplot(Stage1_ModelingOut[Stage1_ModelingOut$Env %in% unique(w_env_combined$Env), ], aes(x = Env, y= 1, fill = H.mean)) + geom_tile(color = "black") + #guides(fill = FALSE) + scale_fill_gradientn(colours = c("white", "#3288BD"), breaks = c(0,1), guide = "colourbar", limits = c(0, 1), name = expression("h"^2)) + ylab(expression("h"^2)) + xlab("") + theme_bw() + theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(), axis.text.y = element_text(color = "white"), axis.ticks.y = element_blank(), axis.title = element_text(face = "bold", size = 18, color = "black"), #axis.text = element_text(size = 18, color = "black"), plot.margin = margin(0,0,0,0, "pt"), axis.ticks.length = unit(0, "pt")) + scale_y_discrete(expand = c(0, 0)) + scale_x_discrete(expand = c(0,0)) inset_stats <- data.frame() for(Models in c("G + E", "PC(Env)", "PC(Markers)")){ sub_df <- w_env_combined %>% filter(Type == Models, CV_Method == Method) %>% group_by(Env) %>% summarise(mean_h = mean(H.mean), mean_p = mean(PredAbi), mean_b = mean(Bias)) model_stats <- data.frame(t(c(Models, Method, cor(sub_df$mean_p, sub_df$mean_h, use = "pairwise.complete"), #Corr between Pred Abi and H.Mean cor(sub_df$mean_p, sub_df$mean_b, use = "pairwise.complete"), #Corr between Pred Abi and Bias cor(sub_df$mean_h, sub_df$mean_b, use = "pairwise.complete")))) #Cor between H.Mean and Bias names(model_stats) <- c("Type", "CV_Method", "Cor_PA_H2", "Cor_PA_Bias", "Cor_H2_Bias") model_stats[, -c(1:2)] <- as.numeric(model_stats[, -c(1:2)]) inset_stats <- rbind(inset_stats, model_stats) } PredAbi_plot <- ggplot(data = w_env_combined[w_env_combined$CV_Method == Method & w_env_combined$Type == "PC(Markers)", ], aes(x = Env, y = PredAbi, fill = H.mean)) + geom_boxplot(alpha = 0.95) + theme_bw() + scale_fill_gradientn(colours = c("white", "#02818a"), breaks = c(0,1), guide = "colourbar", limits = c(0, 1), name = expression("h"^2)) + theme(text = element_text(size = 18, color = "black"), axis.title = element_text(face = "bold", size = 18, color = "black"), axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), legend.title = element_text(size = 18, face = "bold", color = "black"), legend.text = element_text(size = 14, color = "black"), axis.text = element_text(size = 16, color = "black"), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = "white", fill = "white"), panel.grid = element_line(color = "gray60"), plot.margin = margin(0,0,0,0, "pt")) + ylab("PC(Markers) \n Predictive Ability") + xlab("Environment") + scale_x_discrete(expand = c(0.01,0.01)) + annotate(geom = "text", label = paste0("Cor(Pred Abi, H2) =", round(inset_stats$Cor_PA_H2[inset_stats$Type == "PC(Markers)" & inset_stats$CV_Method == Method], 3)), x = 59, y = min(w_env_combined$PredAbi[w_env_combined$CV_Method == Method & w_env_combined$Type == "PC(Markers)"]), hjust = 1, vjust = 0, size = 7) gridded_plots <- plot_grid(Bias_plot, PredAbi_plot, align = "v", nrow = 3, rel_heights = c(0.25, 1)) # png(filename = paste0(OutDir, "test_grid.png"), height = 1000, width = 1500, units = "px") # gridded_plots # dev.off() ggsave(plot = gridded_plots, filename = paste0(OutDir, Method, "_Grid.png"), height = 8, width = 16, units = "in") # grid_plots <- rbind(ggplotGrob(Bias_plot), ggplotGrob(Heritability_Plot), ggplotGrob(PredAbi_plot)) #png(filename = paste0(OutDir, Method, "_Test_Fig.png"), height = 1000, width = 1500, units = "px") # grid.newpage() # grid.arrange(rbind(ggplotGrob(Bias_plot), ggplotGrob(Heritability_Plot), ggplotGrob(PredAbi_plot))) # grid.arrange(Bias_plot, Heritability_Plot, PredAbi_plot, nrow = 3) # grid.draw(rbind(ggplotGrob(Bias_plot), ggplotGrob(Heritability_Plot), ggplotGrob(PredAbi_plot))) #dev.off() } ``` ```{r Bias Grid Plot} Type_lab <- c("G + E", "PCA(Env) * Markers", "PCA(Markers) * Env") CV_lab <- c("CV1" = "CV1", "CV2" = "CV2", "LOYO" = "LO1Y", "LORE" = "LORE", "LORH" = "LORH", "CV65" = "LO1E", "CV3" = "CV1 + \n LO1Y", "CV4" = "LORH + \n LO1Y", "CV6" = "CV1 + \n LORE", "CV5" = "LORH + \n LORE") names(Type_lab) <- unique(combined_bias$Type) combined_bias$CV_Method <- factor(combined_bias$CV_Method, levels = c("CV1", "CV2", "LOYO", "LORE", "LORH", "CV65", "CV3", "CV4", "CV6", "CV5")) Bias_Grid <- ggplot(combined_bias, aes(x = Env, y = 1, fill = Bias)) + facet_grid(CV_Method ~ Type, labeller = labeller(Type =Type_lab, CV_Method = CV_lab)) + geom_tile() + scale_fill_gradient2(low = "#D53E4F", mid = "white", high = "#3288DB", midpoint = 0) + theme_bw() + theme(text = element_text(size = 18, color = "black"), axis.title = element_text(face = "bold", size = 18, color = "black"), axis.title.y = element_blank(), axis.text.x = element_blank(), legend.title = element_text(size = 18, face = "bold", color = "black"), legend.text = element_text(size = 14, color = "black"), axis.text = element_blank(), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = "white", fill = "white"), panel.grid = element_line(color = "gray60"), plot.margin = margin(0,0,0,0, "pt"), strip.background = element_rect(fill = "white"), strip.text = element_text(face = "bold")) + scale_y_discrete(expand = c(0, 0)) Bias_Grid png(filename = paste0(OutDir, "Bias_Grid.png"), height = 1000, width = 1300, units = "px") Bias_Grid dev.off() Across_Env_Bias <- combined_bias %>% group_by(CV_Method, Type) %>% summarise(Mean_GEBV = mean(Mean_GEBV), Mean_Phe = mean(Mean_Phenotype)) %>% mutate(Bias = Mean_GEBV - Mean_Phe) ggplot(Across_Env_Bias, aes(x = CV_Method, y = Bias, color = Type)) + geom_point(size = 5) + theme_bw() + scale_color_manual(values = c("#66C2A5", "#3288BD", "#5E4FA2"), labels = c("No GxE", "PCA(Env) * Markers", "PCA(Markers) * Env"), name = "Model") + scale_x_discrete(labels = c("CV1" = "CV1", "CV2" = "CV2", "LOYO" = "LO1Y", "LORE" = "LORE", "LORH" = "LORH", "CV65" = "LO1E", "CV3" = "CV1 + \n LO1Y", "CV4" = "LORH + \n LO1Y", "CV6" = "CV1 + \n LORE", "CV5" = "LORH + \n LORE")) ``` ```{r Relationships between train and test sets environment and genomic} WorkDirectory <- "E:/My Drive/Anna/" AD_Cols_A <- read.csv(paste0(WorkDirectory, "Paper1_Figures_Code/AD_Cols_RC.csv")) AD_Cols_A <- read.csv(paste0(WorkDirectory, "Paper1_Figures_Code/AD_Cols_A.csv")) AD_Cols_RC <- read.csv(paste0(WorkDirectory, "Paper1_Figures_Code/AD_Cols_RC.csv")) AD_Cols <- bind_rows(AD_Cols_A, AD_Cols_RC) D_Mat <- read.table(file = "E:/My Drive/Anna/G2F_Hybrid_Genomic_Data_To_Analysis/G2FHybrids_DominanceRelationshipMatrix_a.txt", skip = 3, header = FALSE) row.names(D_Mat) <- D_Mat[,1] #Make the rownames the line names D_Mat <- D_Mat[,-1] #Remove the first column of information colnames(D_Mat) <- row.names(D_Mat) #Make the column and row names the same D_Mat <- as.matrix(D_Mat) #turn D_Mat into a matrix WeatherMat <- WeatherMat[rownames(WeatherMat) %in% unique(PhenoG2F$Env), colnames(WeatherMat) %in% unique(PhenoG2F$Env)] #For CV1 Compute_Env_Gen_Relationships <- function(n_folds = 10, fold_column, Direction = "Single"){ Gen_Tab <- {} Env_Tab <- {} for(i in 1:n_folds){ cat(paste0("Working on fold ", i, "."), "\n") if(Direction == "Single"){ Hybs_Test <- PhenoG2F %>% select(!!fold_column, Hybrid) %>% distinct() %>% filter(!!fold_column == i) Hybs_Train <- PhenoG2F %>% select(!!fold_column, Hybrid) %>% distinct() %>% filter(!!fold_column != i) test <- D_Mat[rownames(D_Mat) %in% Hybs_Test$Hybrid, colnames(D_Mat) %in% Hybs_Train$Hybrid] GenRel <- c(i, min(test), quantile(x = test, 0.25), median(test), mean(test), quantile(x = test, 0.75), max(test)) names(GenRel) <- c("fold", "min", "q25", "med", "mean", "q75", "max") Gen_Tab <- bind_rows(Gen_Tab, GenRel) Env_Test <- PhenoG2F %>% select(!!fold_column, Env) %>% distinct() %>% filter(!!fold_column == i) Env_Train <- PhenoG2F %>% select(!!fold_column, Env) %>% distinct() %>% filter(!!fold_column != i) env_mat <- as.matrix(WeatherMat[rownames(WeatherMat) %in% Env_Test$Env, colnames(WeatherMat) %in% Env_Train$Env]) if(ncol(env_mat) == 0){ env_mat <- WeatherMat[lower.tri(WeatherMat, diag = TRUE)] } EnvRel <- c(i, min(env_mat), quantile(x = env_mat, 0.25), median(env_mat), mean(env_mat), quantile(x = env_mat, 0.75), max(env_mat)) names(EnvRel) <- c("fold", "min", "q25", "med", "mean", "q75", "max") Env_Tab <- bind_rows(Env_Tab, EnvRel) } if(Direction == "Double"){ Hybs_Test <- PhenoG2F %>% select(!!fold_column, Hybrid) %>% distinct() %>% filter(!!fold_column == i) Hybs_Train <- PhenoG2F %>% select(!!fold_column, Hybrid) %>% distinct() %>% filter(!!fold_column != i, !(Hybrid %in% Hybs_Test$Hybrid)) test <- D_Mat[rownames(D_Mat) %in% Hybs_Test$Hybrid, colnames(D_Mat) %in% Hybs_Train$Hybrid] GenRel <- c(i, min(test), quantile(x = test, 0.25), median(test), mean(test), quantile(x = test, 0.75), max(test)) names(GenRel) <- c("fold", "min", "q25", "med", "mean", "q75", "max") Gen_Tab <- bind_rows(Gen_Tab, GenRel) Env_Test <- PhenoG2F %>% select(!!fold_column, Env) %>% distinct() %>% filter(!!fold_column == i) Env_Train <- PhenoG2F %>% select(!!fold_column, Env) %>% distinct() %>% filter(!!fold_column != i, !(Env %in% Env_Test$Env)) env_mat <- as.matrix(WeatherMat[rownames(WeatherMat) %in% Env_Test$Env, colnames(WeatherMat) %in% Env_Train$Env]) if(ncol(env_mat) == 0){ env_mat <- WeatherMat[lower.tri(WeatherMat, diag = TRUE)] } EnvRel <- c(i, min(env_mat), quantile(x = env_mat, 0.25), median(env_mat), mean(env_mat), quantile(x = env_mat, 0.75), max(env_mat)) names(EnvRel) <- c("fold", "min", "q25", "med", "mean", "q75", "max") Env_Tab <- bind_rows(Env_Tab, EnvRel) } } TT_Rels <- Gen_Tab %>% left_join(Env_Tab, by = "fold", suffix = c("_GEN", "_ENV")) return(TT_Rels) } CV1_TT_Rels <- Compute_Env_Gen_Relationships(fold_column = quo(RandHybFold), Direction = "Single") CV2_TT_Rels <- Compute_Env_Gen_Relationships(n_folds = 20, fold_column = quo(CV2_Fold_Num), Direction = "Single") CV3_TT_Rels <- Compute_Env_Gen_Relationships(n_folds = 30, fold_column = quo(CV3_Fold), Direction = "Double") CV4_TT_Rels <- Compute_Env_Gen_Relationships(n_folds = 28, fold_column = quo(CV4_Fold), Direction = "Double") CV5_TT_Rels <- Compute_Env_Gen_Relationships(n_folds = 68, fold_column = quo(CV5_Fold), Direction = "Double") CV6_TT_Rels <- Compute_Env_Gen_Relationships(n_folds = 70, fold_column = quo(CV6_Fold), Direction = "Double") LOYO_TT_Rels <- Compute_Env_Gen_Relationships(n_folds = 3, fold_column = quo(YearCluster), Direction = "Single") LORE_TT_Rels <- Compute_Env_Gen_Relationships(n_folds = 7, fold_column = quo(EnvClusters), Direction = "Single") LORH_TT_Rels <- Compute_Env_Gen_Relationships(n_folds = 10, fold_column = quo(WardClusters), Direction = "Single") CV65_TT_Rels <- Compute_Env_Gen_Relationships(n_folds = 59, fold_column = quo(CV65), Direction = "Single") if(exists("All_TT_Rels")){ rm(All_TT_Rels) } All_TT_Rels <- bind_rows(mget(ls(pattern='_TT_Rels')), .id='Source') %>% mutate(Source = str_remove(Source, pattern = "_TT_Rels")) All_TT_Rels <- All_TT_Rels %>% rename(CV_Method = Source)%>% left_join(tab_combined) %>% left_join(CV_Methods) TT_Means <- All_TT_Rels %>% filter(!is.na(Type), Type == "PC(Markers)") %>% group_by(CV_Method, CV_Label, Type) %>% summarise(Mean_Gen = mean(mean_GEN, na.rm = TRUE), Mean_Env = mean(mean_ENV, na.rm = TRUE), mean_predabi = mean(PredAbi, na.rm = TRUE)) ATT_Means <- All_TT_Rels %>% group_by(CV_Method, CV_Label, Type) %>% summarise(Mean_Gen = mean(mean_GEN, na.rm = TRUE), Mean_Env = mean(mean_ENV, na.rm = TRUE), mean_predabi = mean(PredAbi, na.rm = TRUE)) Try1 <- All_TT_Rels %>% select(CV_Label, n_trn, n_test, fold, ends_with("_GEN"), ends_with("_ENV")) %>% distinct() write.csv(Try1, file = paste0(OutDir, "Table_S2_Train_Test_Relationships.csv"), quote = FALSE, row.names = FALSE) #Compute for both across and within pred abi #ANOVA for just cv65 png(filename = paste0(OutDir, "Env_Gen_Train_Test_Cov.png"), width = 600, height = 400, units = "px") ggplot(TT_Means, aes(x = Mean_Env, y = Mean_Gen, fill = mean_predabi, label = CV_Label)) + geom_point(aes(size = 5, fill = mean_predabi, colour = mean_predabi), pch = 21, colour = "black", stroke = 1, alpha = 0.8, position = "jitter") + theme_bw() + #scale_fill_manual(values = c("#9E0142", "#D53E4F", "#F46D43", "#FDAE61", # "#FEE08B", "#E6F598", "#ABDDA4", "#66C2A5", # "#3288BD", "#5E4FA2")) + scale_fill_gradient(low = "#ABDDA4", high = "#02818a", name = "Mean Across \n Environment \n Predictive Ability") + geom_hline(yintercept = 0, color = "red") + geom_vline(xintercept = 0, color = "red") + ggrepel::geom_text_repel() + xlab("Mean Environmental Cov(Train, Test)") + ylab("Mean Dominance Genetic \n Relationship(Train, Test)") + theme(text = element_text(size = 18, color = "black"), axis.title = element_text(face = "bold", size = 18, color = "black"), legend.title = element_text(size = 18, face = "bold", color = "black"), legend.text = element_text(size = 14, color = "black"), plot.background = element_rect(color = "white", fill = "white"), legend.background = element_rect(color = "white", fill = "white"), panel.grid = element_line(color = "gray60")) + guides(size = FALSE) dev.off() ``` AD_Cols[(AD_Cols_A$Hyb1 %in% Hyb_sub$Hybrid & !(AD_Cols_A$Hyb2 %in% Hyb_sub$Hybrid)) | (AD_Cols_A$Hyb2 %in% Hyb_sub$Hybrid & !(AD_Cols_A$Hyb1 %in% Hyb_sub$Hybrid)), ] #Make sure to account for sparse format of column matrix. ```{r} ```