# R Script: BGLR_CVYear_Out_Script.R # G2F prediction cross validation loop for method CV65: Leave one environment out # Author: Anna R. Rogers # Usage: # Rscript BGLR_CV65_Script.R [#i] #Read in argparse to ensure all arguments needed have been collected library(argparse) #Create the parser parser <- ArgumentParser(description = "Modeling G2F Cross Validation and Model types across multiple CV methods") parser$add_argument("--i", default = 1, help = "Fold number to execute. If both i and j are used, this refers to the hybrid fold.") parser$add_argument("--model", help = "What type of Model to run, ex. Full_GxE_Dim_Reduction") parser$add_argument("--outdir", help = "Directory for output of results") parser$add_argument("--cv_method", help = "What type of Cross Validation to run") # Read in the arguments args <- parser$parse_args() #Give an Error if too many or too few arguments argslen <- length(args) if (argslen != 4) stop("Error: Too many or too few arguments, expecting four arguments.") #Define the arguments to go into the looping structure itself i <- args$i #L3 addition: cat the arguments to standard out cat("Command line argument i = ", i, "\n", sep="") # Cross Validation Schematics and looping structure WorkDirectory <- "/gpfs_common/share02/holland/arroger4/Prediction_G2F/" Run_Dir <- WorkDirectory #WorkDirectory <-"E:/My Drive/Anna/Prediction_G2F/" #setwd(WorkDirectory) #Load Libraries #library(doParallel) #library(foreach) library(BGLR) library(Matrix) library(MASS) library(tidyr) library(dplyr) library(ggplot2) source(file = "BGLR_TauDistPatch.txt") #Load Workspace #load(file = paste(WorkDirectory, "BGLR_Space_GxE_Matrix.RData", sep = "")) WorkDirectory <- Run_Dir #Ensure working directory is not overwritten by incoming Rdata files. #Get everything ready to do in parallel: #Since we are doing multiple job submissions be don't need to do this part, because we won't be running anything in parallel #cl <- makeCluster(1) #registerDoParallel(cl) #Make the ETA list to go into the bglr model #### Loading Data #### #Section for Baseline Modeling with different number of A matrix markers if (args$model == "A_20k"){ load(paste0(WorkDirectory, "R_data_objects/A_Matrix_20k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) weights <- NULL eta <- list(Geno = list(X = A_Markers[, -1], model = "BRR")) print(paste0("Do Marker Matrix Names Align with Phenotype names?", all(A_Markers$Hybrid == attr(Yield, "Hybrid")))) print("Running Baseline Model on Blues avergaged across environments using 20k Additive Marker Matrix") } else if (args$model == "A_15k"){ load(paste0(WorkDirectory, "R_data_objects/A_Matrix_15k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) weights <- NULL eta <- list(Geno = list(X = A_Markers_15k[, -1], model = "BRR")) print(paste0("Do Marker Matrix Names Align with Phenotype names?", all(A_Markers_15k$Hybrid == attr(Yield, "Hybrid")))) print("Running Baseline Model on Blues avergaged across environments using 15k Additive Marker Matrix") } else if (args$model == "A_10k"){ load(paste0(WorkDirectory, "R_data_objects/A_Matrix_10k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) weights <- NULL eta <- list(Geno = list(X = A_Markers_10k[, -1], model = "BRR")) print(paste0("Do Marker Matrix Names Align with Phenotype names?", all(A_Markers_10k$Hybrid == attr(Yield, "Hybrid")))) print("Running Baseline Model on Blues avergaged across environments using 10k Additive Marker Matrix") } else if (args$model == "A_5k"){ load(paste0(WorkDirectory, "R_data_objects/A_Matrix_5k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) weights <- NULL eta <- list(Geno = list(X = A_Markers_5k[, -1], model = "BRR")) print(paste0("Do Marker Matrix Names Align with Phenotype names?", all(A_Markers_5k$Hybrid == attr(Yield, "Hybrid")))) print("Running Baseline Model on Blues avergaged across environments using 5k Additive Marker Matrix") #Section for Baseline Modeling with different number of D matrix markers }else if (args$model == "D_20k"){ load(paste0(WorkDirectory, "R_data_objects/D_Matrix_20k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) weights <- NULL eta <- list(Geno = list(X = D_Markers[, -1], model = "BRR")) print(paste0("Do Marker Matrix Names Align with Phenotype names?", all(D_Markers$Hybrid == attr(Yield, "Hybrid")))) print("Running Baseline Model on Blues avergaged across environments using 20k Dominance Marker Matrix") } else if (args$model == "D_15k"){ load(paste0(WorkDirectory, "R_data_objects/D_Matrix_15k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) weights <- NULL eta <- list(Geno = list(X = D_Markers_15k[, -1], model = "BRR")) print(paste0("Do Marker Matrix Names Align with Phenotype names?", all(D_Markers_15k$Hybrid == attr(Yield, "Hybrid")))) print("Running Baseline Model on Blues avergaged across environments using 15k Dominance Marker Matrix") } else if (args$model == "D_10k"){ load(paste0(WorkDirectory, "R_data_objects/D_Matrix_10k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) weights <- NULL eta <- list(Geno = list(X = D_Markers_10k[, -1], model = "BRR")) print(paste0("Do Marker Matrix Names Align with Phenotype names?", all(D_Markers_10k$Hybrid == attr(Yield, "Hybrid")))) print("Running Baseline Model on Blues avergaged across environments using 10k Dominance Marker Matrix") } else if (args$model == "D_5k"){ load(paste0(WorkDirectory, "R_data_objects/D_Matrix_5k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) weights <- NULL eta <- list(Geno = list(X = D_Markers_5k[, -1], model = "BRR")) print(paste0("Do Marker Matrix Names Align with Phenotype names?", all(D_Markers_5k$Hybrid == attr(Yield, "Hybrid")))) print("Running Baseline Model on Blues avergaged across environments using 5k Dominance Marker Matrix") #Run Baseline Models with A and D Matrices (only run on same number of markers for each -not 15k A and 5k D) }else if (args$model == "A+D_20k"){ load(paste0(WorkDirectory, "R_data_objects/A_Matrix_20k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/D_Matrix_20k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) weights <- NULL eta <- list(Additive = list(X = A_Markers[, -1], model = "BRR"), Dominance = list(X = D_Markers[, -1], model = "BRR")) print(paste0("Do A Marker Matrix Names Align with Phenotype names?", all(A_Markers$Hybrid == attr(Yield, "Hybrid")))) print(paste0("Do D Marker Matrix Names Align with Phenotype names?", all(D_Markers$Hybrid == attr(Yield, "Hybrid")))) print("Running Baseline Model on Blues avergaged across environments using 20k A and D Marker Matrices") } else if (args$model == "A+D_15k"){ load(paste0(WorkDirectory, "R_data_objects/A_Matrix_15k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/D_Matrix_15k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) weights <- NULL eta <- list(Additive = list(X = A_Markers_15k[, -1], model = "BRR"), Dominance = list(X = D_Markers_15k[, -1], model = "BRR")) print(paste0("Do A Marker Matrix Names Align with Phenotype names?", all(A_Markers_15k$Hybrid == attr(Yield, "Hybrid")))) print(paste0("Do D Marker Matrix Names Align with Phenotype names?", all(D_Markers_15k$Hybrid == attr(Yield, "Hybrid")))) print("Running Baseline Model on Blues avergaged across environments using 15k A and D Marker Matrices") } else if (args$model == "A+D_10k"){ load(paste0(WorkDirectory, "R_data_objects/A_Matrix_10k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/D_Matrix_10k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) weights <- NULL eta <- list(Additive = list(X = A_Markers_10k[, -1], model = "BRR"), Dominance = list(X = D_Markers_10k[, -1], model = "BRR")) print(paste0("Do A Marker Matrix Names Align with Phenotype names?", all(A_Markers_10k$Hybrid == attr(Yield, "Hybrid")))) print(paste0("Do D Marker Matrix Names Align with Phenotype names?", all(D_Markers_10k$Hybrid == attr(Yield, "Hybrid")))) print("Running Baseline Model on Blues avergaged across environments using 10k A and D Marker Matrices") } else if (args$model == "A+D_5k"){ load(paste0(WorkDirectory, "R_data_objects/A_Matrix_5k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/D_Matrix_5k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs weights <- NULL load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) eta <- list(Additive = list(X = A_Markers_5k[, -1], model = "BRR"), Dominance = list(X = D_Markers_5k[, -1], model = "BRR")) print(paste0("Do A Marker Matrix Names Align with Phenotype names?", all(A_Markers_5k$Hybrid == attr(Yield, "Hybrid")))) print(paste0("Do D Marker Matrix Names Align with Phenotype names?", all(D_Markers_5k$Hybrid == attr(Yield, "Hybrid")))) print("Running Baseline Model on Blues avergaged across environments using 5k A and D Marker Matrices") #Run weighted model sets }else if (args$model == "D_20k_W"){ load(paste0(WorkDirectory, "R_data_objects/D_Matrix_20k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) #load Cross val info load(paste0(WorkDirectory, "R_data_objects/Weights_Yield_AVG.Rdata")) #Load Weights eta <- list(Geno = list(X = D_Markers[, -1], model = "BRR")) print("Running Weighted Baseline Model on Blues avergaged across environments using 20k Dominance Marker Matrix") } else if (args$model == "D_15k_W"){ load(paste0(WorkDirectory, "R_data_objects/D_Matrix_15k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) load(paste0(WorkDirectory, "R_data_objects/Weights_Yield_AVG.Rdata")) #Load Weights eta <- list(Geno = list(X = D_Markers_15k[, -1], model = "BRR")) print("Running Weighted Baseline Model on Blues avergaged across environments using 15k Dominance Marker Matrix") } else if (args$model == "D_10k_W"){ load(paste0(WorkDirectory, "R_data_objects/D_Matrix_10k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) load(paste0(WorkDirectory, "R_data_objects/Weights_Yield_AVG.Rdata")) #Load Weights eta <- list(Geno = list(X = D_Markers_10k[, -1], model = "BRR")) print("Running Weighted Baseline Model on Blues avergaged across environments using 10k Dominance Marker Matrix") } else if (args$model == "D_5k_W"){ load(paste0(WorkDirectory, "R_data_objects/D_Matrix_5k.Rdata")) #Load the A Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_AVG.Rdata")) #Load Blues averaged across Envs load(paste0(WorkDirectory, "R_data_objects/Hyb_Blues_Avg.Rdata")) load(paste0(WorkDirectory, "R_data_objects/Weights_Yield_AVG.Rdata")) #Load Weights eta <- list(Geno = list(X = D_Markers_5k[, -1], model = "BRR")) print("Running Weighted Baseline Model on Blues avergaged across environments using 5k Dominance Marker Matrix") # Environment Main Effect Modeling } else if (args$model == "D_10k_EnvLabel"){ load(paste0(WorkDirectory, "R_data_objects/D_Markers_Full.Rdata")) #Load the D Matrix load(paste0(WorkDirectory, "R_data_objects/Yield_Vector.Rdata")) #Load the Yield Vector load(paste0(WorkDirectory, "R_data_objects/Env_Indicator.Rdata")) weights <- NULL load(paste0(WorkDirectory, "R_data_objects/PhenoG2F.Rdata")) #Load the PhenoG2F Dataframe eta <- list(Geno = list(X = D_Markers_Full[, -1], model = "BRR"), Env = list(X = Env_Indicator, model = "FIXED")) #Set up the ETA list print(paste0("Do D Marker Matrix Names Align with Phenotype names?", all(D_Markers_Full$Hybrid == attr(Yield, "Hybrid")))) print("Running Model on Full Dataset with 10k Dominance Markers and Env Labels") } else if (args$model == "D_10k_5Day"){ load(paste0(WorkDirectory, "R_data_objects/D_Markers_Full.Rdata")) #Load the D Matrix load(paste0(WorkDirectory, "R_data_objects/Full_Weather_5Day.Rdata")) #Load 5 day Full weather load(paste0(WorkDirectory, "R_data_objects/Yield_Vector.Rdata")) #Load the Yield Vector eta <- list(Geno = list(X = D_Markers_Full[, -1], model = "BRR"), Env = list(X = Full_Weather_5Day[, -c(1:6)], model = "BL")) #Set up the ETA list weights <- NULL load(paste0(WorkDirectory, "R_data_objects/PhenoG2F.Rdata")) #Load the PhenoG2F Dataframe print(paste0("Do D Marker Matrix Names Align with Phenotype names?", all(D_Markers_Full$Hybrid == attr(Yield, "Hybrid")))) print(paste0("Does Env data order match Yield Env order?", all(Full_Weather_5Day$Env == attr(Yield, "Env")))) print("Running Model on Full Dataset with 10k Dominance Markers and 5 Day Environmental Covariates") } else if (args$model == "D_10k_10Day"){ load(paste0(WorkDirectory, "R_data_objects/D_Markers_Full.Rdata")) #Load the D Matrix load(paste0(WorkDirectory, "R_data_objects/Full_Weather_10Day.Rdata")) #Load 10 day Full weather load(paste0(WorkDirectory, "R_data_objects/Yield_Vector.Rdata")) eta <- list(Geno = list(X = D_Markers_Full[, -1], model = "BRR"), Env = list(X = Full_Weather_10Day[, -c(1:6)], model = "BL")) weights <- NULL load(paste0(WorkDirectory, "R_data_objects/PhenoG2F.Rdata")) #Load the PhenoG2F Dataframe print(paste0("Do D Marker Matrix Names Align with Phenotype names?", all(D_Markers_Full$Hybrid == attr(Yield, "Hybrid")))) print(paste0("Does Env data order match Yield Env order?", all(Full_Weather_10Day$Env == attr(Yield, "Env")))) print("Running Model on Full Dataset with 10k Dominance Markers and 10 Day Environmental Covariates") } else if (args$model == "D_10k_15Day"){ load(paste0(WorkDirectory, "R_data_objects/D_Markers_Full.Rdata")) #Load the D Matrix load(paste0(WorkDirectory, "R_data_objects/Full_Weather_15Day.Rdata")) #Load 15 day Full weather load(paste0(WorkDirectory, "R_data_objects/Yield_Vector.Rdata")) eta <- list(Geno = list(X = D_Markers_Full[, -1], model = "BRR"), Env = list(X = Full_Weather_15Day[, -c(1:6)], model = "BL")) weights <- NULL load(paste0(WorkDirectory, "R_data_objects/PhenoG2F.Rdata")) #Load the PhenoG2F Dataframe print(paste0("Do D Marker Matrix Names Align with Phenotype names?", all(D_Markers_Full$Hybrid == attr(Yield, "Hybrid")))) print(paste0("Does Env data order match Yield Env order?", all(Full_Weather_15Day$Env == attr(Yield, "Env")))) print("Running Model on Full Dataset with 10k Dominance Markers and 15 Day Environmental Covariates") } else if (args$model == "D_10k_30Day"){ load(paste0(WorkDirectory, "R_data_objects/D_Markers_Full.Rdata")) #Load the D Matrix load(paste0(WorkDirectory, "R_data_objects/Full_Weather_30Day.Rdata")) #Load 30 day Full weather load(paste0(WorkDirectory, "R_data_objects/Yield_Vector.Rdata")) eta <- list(Geno = list(X = D_Markers_Full[, -1], model = "BRR"), Env = list(X = Full_Weather_30Day[, -c(1:6)], model = "BL")) weights <- NULL load(paste0(WorkDirectory, "R_data_objects/PhenoG2F.Rdata")) #Load the PhenoG2F Dataframe print(paste0("Do D Marker Matrix Names Align with Phenotype names?", all(D_Markers_Full$Hybrid == attr(Yield, "Hybrid")))) print(paste0("Does Env data order match Yield Env order?", all(Full_Weather_30Day$Env == attr(Yield, "Env")))) print("Running Model on Full Dataset with 10k Dominance Markers and 30 Day Environmental Covariates") # Full GxE Modeling } else if (args$model == "Full_GxE_Dim_Reduction"){ load(paste0(WorkDirectory, "R_data_objects/D_Markers_Full.Rdata")) #Load the D Matrix load(paste0(WorkDirectory, "R_data_objects/Full_Weather_5Day.Rdata")) #Load 5 day Full weather load(paste0(WorkDirectory, "R_data_objects/Yield_Vector.Rdata")) #Load the Yield Vector eta <- list(Geno = list(X = D_Markers_Full[, -1], model = "BRR"), Env = list(X = Full_Weather_5Day[, -c(1:6)], model = "BL")) #Set up the ETA list weights <- NULL load(paste0(WorkDirectory, "R_data_objects/PhenoG2F.Rdata")) #Load the PhenoG2F Dataframe cat(paste0("Do D Marker Matrix Names Align with Phenotype names?", "\n", all(D_Markers_Full$Hybrid == attr(Yield, "Hybrid")))) cat(paste0("Does Env data order match Yield Env order?", "\n", all(Full_Weather_5Day$Env == attr(Yield, "Env")))) cat(paste0("Running Full GxE Model on Full Dataset with 10k Dominance Markers and 5 Day Environmental Covariates", "\n", "Using Principle Components for Dimensionality Reduction.")) } else if (args$model == "Full_GxE_Feat_Selection"){ load(paste0(WorkDirectory, "R_data_objects/D_Markers_Full.Rdata")) #Load the D Matrix load(paste0(WorkDirectory, "R_data_objects/Full_Weather_5Day.Rdata")) #Load 5 day Full weather load(paste0(WorkDirectory, "R_data_objects/Yield_Vector.Rdata")) #Load the Yield Vector eta <- list(Geno = list(X = D_Markers_Full[, -1], model = "BRR"), Env = list(X = Full_Weather_5Day[, -c(1:6)], model = "BL")) #Set up the ETA list weights <- NULL load(paste0(WorkDirectory, "R_data_objects/PhenoG2F.Rdata")) #Load the PhenoG2F Dataframe cat(paste0("Do D Marker Matrix Names Align with Phenotype names? ", all(D_Markers_Full$Hybrid == attr(Yield, "Hybrid"))), "\n") cat(paste0("Does Env data order match Yield Env order? ", all(Full_Weather_5Day$Env == attr(Yield, "Env"))), "\n") cat(paste0("Running Full GxE Model on Full Dataset with:", "\n", "10k Dominance Markers", "\n", "and", "\n", "5 Day Environmental Covariates", "\n", "Using Feature Selection.")) # } else if(args$model == "Baseline"){ # load(paste0(WorkDirectory, )) # eta <- list(Geno = list(X = Full_Markers[, -1], model = "BRR"), Env = list(X = PhenoG2F$Env, model = "BL"), GxE = list(X = Full_Markers:PhenoG2F$Env, model = "BL")) # print("Running Baseline: Additive Marker Matrix + Env Labels + Interaction Between.") # } else if(args$model == "Dominance"){ # eta <- list(Geno = list(X = Full_Dom[, -1], model = "BRR"), Env = list(X = PhenoG2F$Env, model = "BL"), GxE = list(X = Full_Dom:PhenoG2F$Env, model = "BL")) # print("Running Dominance: Dominance Marker Matrix + Env Labels + Interaction Between.") # } else if(args$model == "A_D"){ # eta <- list(Additive = list(X = Full_Markers[, -1], model = "BRR"), Dominance = list(X = Full_Dom[, -1], model = "BRR"), Env = list(X = PhenoG2F$Env, model = "BL"), AxE = list(X = Full_Markers:PhenoG2F$Env, model = "BL"), DxE = list(X = Full_Dom:PhenoG2F$Env, model = "BL")) # print("Running A + D Model: Additive and Dominance Marker Matrice + Env Labels + Interaction Between.") # } else if(args$model == "5 Day"){ # eta <- list(Geno = list(X = Full_Markers[, -1], model = "BRR"), Dominance = list(X = Full_Dom[, -1], model = "BRR"), Env = list(X = PhenoG2F$Env, model = "BL"), AxE = list(X = Full_Markers:PhenoG2F$Env, model = "BL"), DxE = list(X = Full_Dom:PhenoG2F$Env, model = "BL")) # print("Running A + D Model: Additive and Dominance Marker Matrice + Env Labels + Interaction Between.") } else if (args$model == "Full_GxE_PC_G"){ load(paste0(WorkDirectory, "R_data_objects/D_Markers_Full.Rdata")) #Load the D Matrix load(paste0(WorkDirectory, "R_data_objects/Full_Weather_5Day.Rdata")) #Load 5 day Full weather load(paste0(WorkDirectory, "R_data_objects/Yield_Vector.Rdata")) #Load the Yield Vector eta <- list(Geno = list(X = D_Markers_Full[, -1], model = "BRR"), Env = list(X = Full_Weather_5Day[, -c(1:6)], model = "BL")) #Set up the ETA list weights <- NULL load(paste0(WorkDirectory, "R_data_objects/PhenoG2F.Rdata")) #Load the PhenoG2F Dataframe cat(paste0("Do D Marker Matrix Names Align with Phenotype names?", "\n", all(D_Markers_Full$Hybrid == attr(Yield, "Hybrid")))) cat(paste0("Does Env data order match Yield Env order?", "\n", all(Full_Weather_5Day$Env == attr(Yield, "Env")))) cat(paste0("Running Full GxE Model on Full Dataset with 10k Dominance Markers and 5 Day Environmental Covariates", "\n", "Using Principle Components for Dimensionality Reduction.")) } else { print("Incorrect Model Specification Given") } #### Set Number of Iterations and Burn in #### niter <- 12000 burn <- 1000 WorkDirectory <- Run_Dir #Make sure the work directory is set to the correct space after loading in all workspaces. print(WorkDirectory) outdir <- args$outdir print(outdir) ### CV1 Type Loop: Leave out a subset of hybrids in all environments #This is like predicting a new line that hasn't been tested in any of the environments #Set up to run the loop, we need some empty frames to put things in CV_Out <- {} W_Env <- {} tab_results <- {} start_time <- proc.time() #For now we are commenting out the doParallel part of the model and the foreach part. If you do run the foreach part, then you need to use .packages because each core needs to be able to load the packages used in the loop. #foreach (i = 1:length(unique(PhenoG2F$RandHybFold)), .packages = c("BGLR", "Matrix", "tidyr", "dplyr")) %dopar% { #Source the BGLR Tau Distribution patch source(file = paste(WorkDirectory, "BGLR_TauDistPatch.txt", sep = ""), local = TRUE) #### Set CV Method #### #Determine CV method to set up NA values if (args$cv_method == "CV1"){ #Leave out 10% of Hybrids at random ####Set up the NA values for CV #### print("Running CV1 Cross Validation.\n") whichNA <- which(PhenoG2F$RandHybFold == i) #Which rows are in the test set yNA <- Yield #Grab the data to convert to NA since we don't want to overwrite the Yield data yNA[whichNA, 1] <- NA #Put the test set as NA, but make sure the weights are not set to NA as they are needed for computation of estimates. } else if (args$cv_method == "CV2"){ #Leave out 10% of Hybrids at half the locations that they are planted in. print("Running CV2 Cross Validation, Burgueno method (2012) \n") whichNA <- which(PhenoG2F$CV2_Fold_Num == i) yNA <- Yield #Grab the data to convert to NA since we don't want to overwrite the Yield data yNA[whichNA, 1] <- NA #Put the test set as NA, but make sure the weights are not set to NA as they are needed for computation of estimates. } else if (args$cv_method == "CV3"){ #Leave out 10% of Hybrids and 1 Year of Environments print("Running CV3 Cross Validation, Leave out 10% of hybrids and one year of environments \n") Leave_Out <- PhenoG2F %>% select(RandHybFold, YearCluster, CV3_Fold) %>% distinct() whichNA <- which(PhenoG2F$RandHybFold == Leave_Out$RandHybFold[Leave_Out$CV3_Fold == i] | PhenoG2F$YearCluster == Leave_Out$YearCluster[Leave_Out$CV3_Fold == i]) yNA <- Yield #Grab the data to convert to NA since we don't want to overwrite the Yield data yNA[whichNA, 1] <- NA #Put the test set as NA, but make sure the weights are not set to NA as they are needed for computation of estimates. } else if (args$cv_method == "CV4"){ #Leave out set of related Hybrids and one year of environments print("Running CV4 Cross Validation, leave out set of related hybrids from PCA clustering and one year of environments \n") Leave_Out <- PhenoG2F %>% select(WardClusters, YearCluster, CV4_Fold) %>% distinct() whichNA <- which(PhenoG2F$WardClusters == Leave_Out$WardClusters[Leave_Out$CV4_Fold == i] | PhenoG2F$YearCluster == Leave_Out$YearCluster[Leave_Out$CV4_Fold == i]) yNA <- Yield #Grab the data to convert to NA since we don't want to overwrite the Yield data yNA[whichNA, 1] <- NA #Put the test set as NA, but make sure the weights are not set to NA as they are needed for computation of estimates. } else if (args$cv_method == "CV5"){ #Leave out Clusters of Envs and Hybrids print("Running CV5 Cross Validation, leave out a set of related hybrids from PCA and related set of environments from env clustering \n") Leave_Out <- PhenoG2F %>% select(WardClusters, EnvClusters, CV5_Fold) %>% distinct() whichNA <- which(PhenoG2F$WardClusters == Leave_Out$WardClusters[Leave_Out$CV5_Fold == i] | PhenoG2F$EnvClusters == Leave_Out$EnvClusters[Leave_Out$CV5_Fold == i]) yNA <- Yield #Grab the data to convert to NA since we don't want to overwrite the Yield data yNA[whichNA, 1] <- NA #Put the test set as NA, but make sure the weights are not set to NA as they are needed for computation of estimates. } else if (args$cv_method == "CV6"){ #Leave out cluster of Envs and 10% randomm hybrids print("Running CV6 Cross Validation, leave out 10% random hybrids and a related set of envirnoments from env clustering \n") Leave_Out <- PhenoG2F %>% select(RandHybFold, EnvClusters, CV6_Fold) %>% distinct() whichNA <- which(PhenoG2F$RandHybFold == Leave_Out$RandHybFold[Leave_Out$CV6_Fold == i] | PhenoG2F$EnvClusters == Leave_Out$EnvClusters[Leave_Out$CV6_Fold == i]) yNA <- Yield #Grab the data to convert to NA since we don't want to overwrite the Yield data yNA[whichNA, 1] <- NA #Put the test set as NA, but make sure the weights are not set to NA as they are needed for computation of estimates. } else if (args$cv_method == "LOYO"){ #Leave One Year out print("Running Leave One Year out Cross Validation.") whichNA <- which(PhenoG2F$YearCluster == i) yNA <- Yield #Grab the data to convert to NA since we don't want to overwrite the Yield data yNA[whichNA, 1] <- NA #Put the test set as NA, but make sure the weights are not set to NA as they are needed for computation of estimates. } else if (args$cv_method == "LORE"){ # print("Running CV on Env Clusters") whichNA <- which(PhenoG2F$EnvClusters == i) yNA <- Yield #Grab the data to convert to NA since we don't want to overwrite the Yield data yNA[whichNA, 1] <- NA #Put the test set as NA, but make sure the weights are not set to NA as they are needed for computation of estimates. } else if (args$cv_method == "LORH"){ # print("Running CV on Hybrid PCA Clusters") whichNA <- which(PhenoG2F$WardClusters == i) yNA <- Yield #Grab the data to convert to NA since we don't want to overwrite the Yield data yNA[whichNA, 1] <- NA #Put the test set as NA, but make sure the weights are not set to NA as they are needed for computation of estimates. } else if (args$cv_method == "CV65"){ # print("Running Leave one Environment out Cross Validation (LOOCV)") whichNA <- which(PhenoG2F$CV65 == i) yNA <- Yield #Grab the data to convert to NA since we don't want to overwrite the Yield data yNA[whichNA, 1] <- NA #Put the test set as NA, but make sure the weights are not set to NA as they are needed for computation of estimates. } else if (str_detect(args$cv_method, pattern = "BASELINE")){ #' #For Baseline the parameter input will be BASELINE_n, where n is the percentage of data to be left out at random from the dataset. #Seed is set using the number i for the fold so that each fold will have started from the same seed regardless of what the data is left out. print("Running BASELINE for creating sample size curve") set.seed(2082021) #Set the seed to get 10 random numbers for use in fold creating seeds <- .Random.seed[1:10] #Grab the new set of random seeds set.seed(seeds[as.numeric(i)]) test <- rbinom(n = nrow(PhenoG2F), size = 1, prob = as.numeric(str_remove(args$cv_method, pattern = "BASELINE_"))/100) whichNA <- which(test == 1) yNA <- Yield #Grab the data to convert to NA since we don't want to overwrite the Yield data yNA[whichNA, 1] <- NA #Put the test set as NA, but make sure the weights are not set to NA as they are needed for computation of estimates. } else { cat("Incorrect Specification for Cross Validation Method. \n", "Currently the following methods are supported: \n", "Baseline downsampling to determine sample size effect, \n", "CV1: Leave out 10% of Hybrids in all locations \n", "CV2: Burgueno Method (2012) \n", "CV3: Leave out 10% of Hybrids and 1 Year of Environments \n", "CV4: Leave out Hybrids using PCA Clustering and 1 Year of Environments \n", "CV5: Leave out Hybrids using PCA Clustering and Related Set of Envs from Env Clustering \n", "CV6: Leave out 10% of Hybrids and Related Set of Envs from Env Clustering \n", "LOYO: Leave out One Year of Environments (LOYO) \n", "LORE: Leave out a Set of Related Environments (LORE) \n", "LORH: Leave out a Set of Related Hybrids (LORH) \n", "CV65: Leave out One Environment (Similar to LOOCV) \n") } #Do feature selection here: -test on CV1 method # 1) Perform selection by running models setting hybrid as IDVG # 2) Grab the features that should be used for the GxE Term (10 best features or 10 best factors (selection vs dim reduction) # 3) direct matrix multiplication with D matrix and selected covariates # 4) Output what features were selected # 5) For Factor model predict scores for missing envs, for features selection just select the features # 5) Remember to remake ETA object #### Determine GxE term #### if(args$model == "Full_GxE_Dim_Reduction"){ Weather <- Full_Weather_5Day %>% distinct() row.names(Weather) <- Weather$Env library("psych") #FA_Mod <- fa(Weather[, -c(1:6)], nfactors = 10, rotate = "varimax", scores = "regression", fm = "minres") cat("\n", "Executing PCA model on environmental data") PC_Mod <- prcomp(x = Weather[, -c(1:6)], rank. = 10) summary(PC_Mod) #Extract PC scores PC_Extract <- as.data.frame(PC_Mod$x) PC_Extract$Env <- row.names(PC_Extract) PC_Extract <- PC_Extract %>% right_join(Weather[, c(1:3)]) %>% dplyr::select(Env, Experiment, Year, everything()) #Save Factors out, save out percentage of variance cat("\n", "Saving out PC Model.") save(PC_Mod, file = paste0(WorkDirectory, outdir, "/", args$cv_method, args$model, "_fold_", i, "_PCA.RData")) # Scale scores back up and create GxE term: GxE_Part <- Full_Weather_5Day %>% dplyr::select(Env) %>% left_join(PC_Extract) GxE_Term <- matrix(nrow = nrow(Full_Weather_5Day)) for(m in 1:(ncol(GxE_Part) - 3)){ cat("\n", paste0("Working on GxE Part ", m, "of ", ncol(GxE_Part)-3)) GxE_tmp <- D_Markers_Full[, -1] * GxE_Part[, (m+3)] names(GxE_tmp) <- paste0(names(D_Markers_Full[, -1]), "_Factor_", m) GxE_Term <- cbind(GxE_Term, GxE_tmp) rm(GxE_tmp) } GxE_Term <- GxE_Term[ , -1] print(dim(GxE_Term)) cat("Does ncol(GxE_Term) = Number of markers * 10?", "\n", "ncol(GxE_Term): ", ncol(GxE_Term), "\n", "ncol(Markers)*10: ", ncol(D_Markers_Full[, -1])*10, sep = "") #Use as input for prediction eta$GxE = list(X = GxE_Term, model = "BL") #Now for GxE Using Feature selection } else if(args$model == "Full_GxE_Feat_Selection"){ stop("Feature selection has not been implemented at this point, please pick a different model.") # Weather <- PhenoG2F %>% dplyr::select(Hybrid) # Weather <- cbind(Weather, Full_Weather_5Day) # Covariates <- names(Weather)[-c(1:7)] # Full_Covariates <- Covariates # # bags_ind <- PhenoG2F$RandHybFold %% 2 #check if the number is even # bags <- ceiling(PhenoG2F$RandHybFold/2) #Change the random hybrid folds to being 5 sets for bagging (20%) or data each # # for(bag in bags){ #Design bag to exclude hybrids completely? 10% data in a bag, move to 20%, 30%, # bag_y <- yNA[bags == bag, ] # bag_weather <- Weather[bags == bag, ] # # cat("Making Model with no Env Cov GxE", "\n") # cur.mod <- lm(bag_y$Yield_Mg_ha_BLUE ~ Hybrid + Env, data = bag_weather) # New_Y <- cur.mod$residuals # pre.mod <- cur.mod #Record the model that was used prior to the loop # cur_cov <- NA # cur_mod_BIC <- stats::BIC(cur.mod) # model_stats <- data.frame(t(c(cur_cov, cur_mod_BIC))) # names(model_stats) <- c("Covariate", "BIC") # cat("Making Initial Models", "\n") # # for(m in (1:length(Covariates))){ # #Make the full model: # full.mod <- lm(formula = bag_y$Yield_Mg_ha_BLUE ~ Hybrid + Env + Hybrid:bag_weather[, Covariates[m]], data = bag_weather) # full_mod_BIC <- stats::BIC(full.mod) # # tally <- c(Covariates[m], full_mod_BIC) # model_stats <- rbind(model_stats, tally) # } # # if(model_stats$BIC[is.na(model_stats$Covariate)] == min(model_stats$BIC)){ # print("No GxE Covariates will be used") # } else { # cat("One covariate was added") # } # # #This was as far as I got -i kept running into this if-else spitting out that we wouldn't use any GxE covariates. # # Added_Covs <- model_stats$Covariate[model_stats$BIC == min(model_stats$BIC)] # Covariates <- Full_Covariates[!(Full_Covariates %in% Added_Covs)] # Added_Format <- paste0("Hybrid*Weather[, '", Added_Covs, "']") # model = paste0("yNA$Yield_Mg_ha_BLUE ~ Hybrid + Env + ", paste(Added_Format, collapse = " + ")) # cur.mod <- lm(formula = as.formula(model), data = Weather) # #cur_cov <- Added_Covs # cur_mod_BIC <- stats::BIC(cur.mod) # #model_stats <- data.frame(t(c(cur_cov, cur_mod_BIC))) # #names(model_stats) <- c("Covariate", "BIC") # while(length(Added_Covs) < 10){ # cat("Number of Added Covariates: ", length(Added_Covs), "\n") # Added_Format <- paste0("Hybrid:Weather[, '", Added_Covs, "']") # model = paste0("yNA$Yield_Mg_ha_BLUE ~ Hybrid + Env + ", paste(Added_Format, collapse = " + ")) # cur.mod <- lm(as.formula(model), data = Weather) # #cur_cov <- Added_Covs # cur_mod_BIC <- stats::BIC(cur.mod) # # for(m in 1:length(Covariates)){ # full.mod <- lm(formula = as.formula(paste0(model, "+ Hybrid:Weather[, Covariates[m]]")), data = Weather) # full_mod_BIC <- stats::BIC(full.mod) # # tally <- c(Covariates[m], full_mod_BIC) # model_stats <- rbind(model_stats, tally) # } # if(cur_mod_BIC == min(model_stats$BIC)){ # cat("Fitting ", length(Added_Covs), " environment covariates for GxE") # break() # } else { # Added_Covs <- as.character(c(Added_Covs, model_stats$Covariate[model_stats$BIC == min(model_stats$BIC)])) # Covariates <- Full_Covariates[!(Full_Covariates %in% Added_Covs)] # } # } # } # } # # Figure out how to select 10 env covariates from the lm on this given set, # agregate those 10 into a selected category # run again (10x) # pick the most used covarites as the GxE covariates # } # #Make the Baseline model (No Env Covs) # cat("Making Model with no Env Cov GxE", "\n") # cur.mod <- lm(yNA$Yield_Mg_ha_BLUE ~ Hybrid + Env, data = Weather) # New_Y <- cur.mod$residuals # pre.mod <- cur.mod #Record the model that was used prior to the loop # cur_cov <- NA # cur_mod_BIC <- stats::BIC(cur.mod) # model_stats <- data.frame(t(c(cur_cov, cur_mod_BIC))) # names(model_stats) <- c("Covariate", "BIC") # cat("Making Initial Models", "\n") # for(m in (1:2)){ # #Make the full model: # full.mod <- lm(formula = New_Y ~ Hybrid:Weather[, Covariates[m]], data = Weather) # full_mod_BIC <- stats::BIC(full.mod) # # tally <- c(Covariates[m], full_mod_BIC) # model_stats <- rbind(model_stats, tally) # } # # if(model_stats$BIC[is.na(model_stats$Covariate)] == min(model_stats$BIC)){ # print("No GxE Covariates will be used") # } else { # Added_Covs <- model_stats$Covariate[model_stats$BIC == min(model_stats$BIC)] # Covariates <- Full_Covariates[!(Full_Covariates %in% Added_Covs)] # Added_Format <- paste0("Hybrid*Weather[, '", Added_Covs, "']") # model = paste0("yNA$Yield_Mg_ha_BLUE ~ Hybrid + Env + ", paste(Added_Format, collapse = " + ")) # cur.mod <- lm(formula = as.formula(model), data = Weather) # #cur_cov <- Added_Covs # cur_mod_BIC <- stats::BIC(cur.mod) # #model_stats <- data.frame(t(c(cur_cov, cur_mod_BIC))) # #names(model_stats) <- c("Covariate", "BIC") # while(length(Added_Covs) < 10){ # cat("Number of Added Covariates: ", length(Added_Covs), "\n") # Added_Format <- paste0("Hybrid:Weather[, '", Added_Covs, "']") # model = paste0("yNA$Yield_Mg_ha_BLUE ~ Hybrid + Env + ", paste(Added_Format, collapse = " + ")) # cur.mod <- lm(as.formula(model), data = Weather) # #cur_cov <- Added_Covs # cur_mod_BIC <- stats::BIC(cur.mod) # # for(m in 1:length(Covariates)){ # full.mod <- lm(formula = as.formula(paste0(model, "+ Hybrid:Weather[, Covariates[m]]")), data = Weather) # full_mod_BIC <- stats::BIC(full.mod) # # tally <- c(Covariates[m], full_mod_BIC) # model_stats <- rbind(model_stats, tally) # } # if(cur_mod_BIC == min(model_stats$BIC)){ # cat("Fitting ", length(Added_Covs), " environment covariates for GxE") # break() # } else { # Added_Covs <- as.character(c(Added_Covs, model_stats$Covariate[model_stats$BIC == min(model_stats$BIC)])) # Covariates <- Full_Covariates[!(Full_Covariates %in% Added_Covs)] # } # } # } # } # # # #model_stats <- data.frame(t(c(cur_cov, cur_mod_BIC))) # #names(model_stats) <- c("Covariate", "BIC") # # for(m in 1:length(Covariates)){ # full.mod <- lm(formula = yNA$Yield_Mg_ha_BLUE ~ Hybrid + Env + Hybrid*Weather[, Added_Covs] + Hybrid*Weather[, Covariates[m]], data = Weather) # full_mod_BIC <- stats::BIC(full.mod) # # tally <- c(Covariates[m], full_mod_BIC) # model_stats <- rbind(model_stats, tally) # } # if(cur_mod_BIC == min(model_stats$BIC)){ # print("Fitting two environment covariates for GxE") # } # } # #Now for GxE with PCs on the G side, and env variables still used rather than summarized } else if(args$model == "Full_GxE_PC_G") { D_Markers_10k <- distinct(D_Markers_Full) row.names(D_Markers_10k) <- D_Markers_10k[, 1] Hybrid_PCA <- prcomp(x = D_Markers_10k[, -1]) PCA_Sel <- as.data.frame(Hybrid_PCA$x[, 1:265]) #Select 265 PCs to use to approximate the genetic component, to keep dimensions the same between the PC on ENV and the PC on G (100k). This accounts for 70% of genetic variance PCA_Sel$Hybrid <- row.names(PCA_Sel) PCA_Sel <- PCA_Sel %>% select(Hybrid, everything()) Hyb_PCA_Full <- PhenoG2F %>% select(Hybrid) %>% left_join(y = PCA_Sel) GxE_Term <- matrix(nrow = nrow(Full_Weather_5Day)) for(m in 1:(ncol(Hyb_PCA_Full)-1)){ cat("\n", paste0("Working on GxE Part ", m, " of ", (ncol(Hyb_PCA_Full) - 1), ".")) GxE_tmp <- Full_Weather_5Day[, -c(1:6)] * Hyb_PCA_Full[, (m + 1)] names(GxE_tmp) <- paste0(names(Full_Weather_5Day)[-c(1:6)], "_", names(Hyb_PCA_Full)[m + 1]) GxE_Term <- cbind(GxE_Term, GxE_tmp) } GxE_Term <- GxE_Term[, -1] print(dim(GxE_Term)) cat("Does ncol(GxE_Term) = Number of PCs * Number of Weather variables (minus ID variables)?", "\n", "ncol(GxE_Term): ", ncol(GxE_Term), "\n", "ncol(Weather)*number of PCs: ", ncol(Full_Weather_5Day[, -c(1:6)])*265, sep = "") #Use as input for prediction eta$GxE = list(X = GxE_Term, model = "BL") } else { print("Fitting Model without GxE Term") } # Added_Cov <- model_stats$ # min.mod <- lm(yNA$Yield_Mg_ha_BLUE ~ Hybrid + Env, data = Weather) # #Make the full model: # full.mod <- lm(formula = yNA$Yield_Mg_ha_BLUE ~ .^2, data = Weather[, -2]) # # Step_Mod <- stepAIC(min.mod, scope = list(upper = full.mod, lower = min.mod), direction = "forward", k = log(nrow(Weather3YRCenter)), trace = FALSE, steps = 15) #Use the K option to give BIC rather than AIC # # stepAIC(lower = Hybrid + Env, upper = Hybrid + Env + Hybrid*Weather[, -c(1:6)]), #Stop at 10 features, just from Yield = stepAIC(Hybrid + Env + Hybrid*Weather) #Force the Hybrid, and # GxE_Part = Weather[, selected_features] # eta$GxE = GxE_Part #### Run the BGLR Model #### #weights = sqrt(yNA$Yield_Mg_ha_weight) Test_Mod <- BGLR(y = yNA$Yield_Mg_ha_BLUE, weights = weights, ETA = eta, burnIn = burn, nIter = niter, response_type = "gaussian", thin = 10, saveAt = paste(WorkDirectory, outdir, "/CV1_", args$model, "_fold_", i, "_", sep = ""), verbose = TRUE) summary(Test_Mod) print(paste0("Model fitting complete for ", i, "th fold, moving to output generation.")) #Save out the Test_Mod object with it's respective fold name save(Test_Mod, file = paste(WorkDirectory, outdir, "/", args$cv_method, "_", args$model, "_fold_", i, "_Model.RData", sep = "")) print("Outputing prediction results") #Now work with the predictions Preds <- {} #Make an empty place to store the predictions Preds$GEBV <- Test_Mod$yHat[whichNA] #Extract Predictions Preds$Phenotype <- Yield$Yield_Mg_ha_BLUE[whichNA] #Extract the observed values Preds <- as.data.frame(Preds) #Switch to data frame object Preds$Hybrid <- attr(Yield, "Hybrid")[whichNA] #output identifying information of Hybrid Preds$Env <- attr(Yield, "Env")[whichNA] #output identifying information of Env Preds$Fold <- i #output the fold so we know what fold everything is from CV_Out <- rbind(CV_Out, Preds) #Bind everything onto the bottom of the dataframe accumulating the results print("Calculating tab results.") #For tabulated results, do for each fold, and also within sites for each fold -then average across sites for each fold tab <- list() tab$fold <- i tab$PredAbi <- round(cor(x = Preds$Phenotype, y = Preds$GEBV, use = "pairwise.complete.obs"), 3) #PredAbi correlation across all sites #Make a cor within each site, then also average across all sites print("Calculating within environment results") for (k in 1:length(unique(Preds$Env))) { WithinEnv <- {} Env_k <- sort(unique(Preds$Env))[k] fold <- i WithinEnv$Env <- Env_k WithinEnv$fold <- fold WithinEnv$PredAbi <- round(cor(x = Preds$Phenotype[Preds$Env == Env_k], y = Preds$GEBV[Preds$Env == Env_k], use = "pairwise.complete.obs"), 3) #Within Env Predictive Ability WithinEnv$RankCor <- round(cor(x = Preds$Phenotype[Preds$Env == Env_k], y = Preds$GEBV[Preds$Env == Env_k], use = "pairwise.complete.obs", method = "spearman"), 3) #Within Env Rank Correlation WithinEnv$MSE <- round(mean((Preds$Phenotype[Preds$Env == Env_k] - Preds$GEBV[Preds$Env == Env_k])^2, na.rm = TRUE), 3) #MSE between observed and predicted WithinEnv$Best10 <- round(mean(tail(sort(Preds$Phenotype[Preds$Env == Env_k], decreasing = TRUE), n = ceiling(nrow(Preds[Preds$Env == Env_k, ]) * 0.1))), 3) WithinEnv <- as.data.frame(WithinEnv) W_Env <- rbind(W_Env, WithinEnv) } print("generating remainder of tab results.") tab$RankCor <- round(cor(x = Preds$Phenotype, y = Preds$GEBV, use = "pairwise.complete.obs", method = "spearman"), 3) #Maybe we don't care about this #Also do within site tab$MSE <- round(mean((Preds$Phenotype - Preds$GEBV)^2, na.rm = TRUE), 3) #MSE between observed and predicted tab$Best10 <- round(mean(tail(sort(Preds$Phenotype, decreasing = T), n = ceiling(nrow(Preds) * 0.1))), 3) #Grab the best 10% of yields tab$logLikAtPostMean <- Test_Mod$fit$logLikAtPostMean tab$postMeanLogLik <- Test_Mod$fit$postMeanLogLik tab$pD <- Test_Mod$fit$pD tab$DIC <- Test_Mod$fit$DIC tab$varE <- Test_Mod$varE tab$summary <- summary(Test_Mod) tab <- as.data.frame(tab) end_time <- proc.time() tab$User_Time <- end_time[1] - start_time[1] tab$System_Time <- end_time[2] - start_time[2] tab$Elapsed_Time <- end_time[3] - start_time[3] tab_results <- rbind(tab_results, tab) start_time <- end_time #Replace the start time for the next iteration #### Write out results#### print(paste0("writing results to CSV files. Files can be found in ", WorkDirectory, outdir)) write.csv(CV_Out, file = paste(WorkDirectory, outdir, "/", args$cv_method, "_", args$model, "_fold_", i, "_Predictions.csv", sep = ""), row.names = FALSE) #Write the predictions out to the file write.csv(tab_results, file = paste(WorkDirectory, outdir, "/", args$cv_method, "_", args$model, "_fold_", i , "_tab_results.csv", sep = ""), row.names = FALSE)#Write the tab results out to a file write.csv(W_Env, file = paste(WorkDirectory, outdir, "/", args$cv_method, "_", args$model, "_fold_", i, "_WithinEnv_Results.csv", sep = ""), row.names = FALSE)#write the within Env results out to a file #} #Compute the accuracy for each fold individually -maybe some groupings are more difficult to predict than others. #fold_accuracy_CV1 <- CV_Out %>% group_by(Fold) %>% summarise(Cor = cor(x = GEBV, y = Phenotype)) #Compute overall accuracy #Overall_Accuracy_CV1 <- cor(x = CV_Out$GEBV, y = CV_Out$Phenotype) #write.csv(CV_Out, file = paste(WorkDirectory, "Test/CV1_Predictions.csv", sep = ""), row.names = FALSE) #Write the predictions out to the file #write.csv(tab_results, file = paste(WorkDirectory, "Test/CV1_tab_results.csv", sep = ""), row.names = FALSE)#Write the tab results out to a file #write.csv(W_Env, file = paste(WorkDirectory, "Test/CV1_WithinEnv_Results.csv", sep = ""), row.names = FALSE)#write the within Env results out to a file print(paste0("Modeling and Output Complete for ", args$model, " fold ", i, "of ", args$cv_method, ".")) #Remember to move output results to a space other than the share drive))