--- title: "Analysis of Environments 14-16, Combining Data files into multiyear files" author: "Anna R Rogers, NCSU" date: "November 2, 2017" output: html_document --- ```{r} Weather14Means <- read.csv("C:/Users/arroger4/Google Drive/Anna/Weather14Means.csv", header = TRUE) Weather16Means <- read.csv("") Weather15Means <- read.csv() ``` #Need to change IAH# in 2016 to match 2014 and 2015 data in both weather and phenotype data files #Done in excel 11/6/2017. -ARR ```{r} #Anova on MNH 2016 data to check that yield differs between hybrids. #MNH16 <- ThreeYearPheno[ThreeYearPheno$Env == "MNH1_2016",] #MNH16_lm <- lm(Yield ~ Line, data = MNH16) #anova(MNH16_lm) ``` #Need to make three year files for each type of data (Phenotype and Weather). Genotype three year is already made from tassel, just need to add on to matrix like done in 2014 ```{r} library(gtools) Pheno14 <- read.csv("G:/My Drive/Anna/G2F Phenotype Data/g2f_2014_hybrid_data_no_outliers.csv", header = TRUE) Pheno15 <- read.csv("Q:/My Drive/Anna/G2F Phenotype Data/g2f_2015_hybrid_data_no_outliers.csv", header = TRUE) Pheno16 <- read.csv("G:/My Drive/Anna/G2F Phenotype Data/g2f_2016_hybrid_data_no_outliers.csv", header = TRUE) Pheno14$Year <- 2014 Pheno15$Year <- 2015 Pheno16$Year <- 2016 names(Pheno14) names(Pheno15) names(Pheno16) #Need to coerce the dataframes all into same dimensions and roder to use RBIND function. This totally sucks but what the heck we've already been doing this for 60 katrillion years wooooooooo #I'm keeping 2014 as our base set (it has the core columns that I think are worth keeping) #1) Rename Replicate in 2015 and 2016 to Rep like it is in 2014 colnames(Pheno15)[8] <- "Rep" colnames(Pheno16)[17] <- "Rep" #2) Change testweight column in 2015 to match name of same measurement in 14/16 data colnames(Pheno15)[26] <- "Test.Weight..lbs.bu." colnames(Pheno15)[28] <- "Grain.yield..bu.A." colnames(Pheno15)[13] <- "Plot.area" #Add in Heterotic Pool column to 2015 and 2016, as well as experiment column Pheno15$Heterotic.Pool <- NA Pheno16$Heterotic.Pool <- NA Pheno15$Experiment <- Pheno15$Field.Location Pheno16$Block <- NA #There was no block info for 2016 data #Remove columns for individual plant ear and plant heights, Northern leaf blight ratings, PLTHT.tassel, from 15 data Pheno15a <- Pheno15[,c(-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49)] #Remove columns for field.plot.name (All of them are Ames), plot.length.field, alt.plot.area, alley,length, row.spacing, row.perpacket, X.seed from 16 data. Pheno16a <- Pheno16[,c(-7,-8,-10,-11,-12,-13,-14)] #Add a damage.plants column to 2016 Pheno16a$Damaged.plants <- NA #Removing extra columns for lodging.rating, percent.northern.leaf.blight, raccoon.damage.rating, Plant.height.cm.to.tip.tassel. Pheno14a <- Pheno14[,c(-35,-36,-37,-38)] #Filters for 2014 were applied on the basis of comments, plot.dicarded, and filler columns. Will use these in 2015 and 2016 as well. Pheno1415 <- smartbind(Pheno14a, Pheno15a) #Combine all three years into one dataframe Pheno_All_Years_Combined <- smartbind(Pheno1415, Pheno16a) #Save Combined Years Phenotype Data #write.csv(x = Pheno_All_Years_Combined, file = "C:/Users/arroger4/Google Drive/Anna/G2f_Pheno_2014_15_16.csv", row.names = FALSE) ``` Filter the phenotype data to clean up. ```{r} ThreeYearPheno <- read.csv("C:/Users/spiro/Google Drive/Anna/G2F Phenotype Data/G2f_Pheno_2014_15_16.csv", header = TRUE) names(ThreeYearPheno)[names(ThreeYearPheno) == "Grain.yield..bu.A."] = "Yield" names(ThreeYearPheno)[names(ThreeYearPheno) == "Pedigree"] = "Line" names(ThreeYearPheno)[names(ThreeYearPheno) == "Field.Location"] = "Experiment" Reciprocal <- as.character(unique(ThreeYearPheno$Line)) Splits <- strsplit(x = Reciprocal, split = "/") #Issue, some entries in the list have 4 strings, some ahave 3, some have 2 SplitsA <- t(sapply(Splits, function(x) {x[c(1,length(x))]})) Reciprocal <- as.data.frame(Reciprocal) Reciprocal$Reciprocal <- as.character(Reciprocal$Reciprocal) Reciprocal$P1 <- SplitsA[,2] Reciprocal$P2 <- SplitsA[,1] Reciprocal$Cross <- paste(Reciprocal$P1, Reciprocal$P2, sep = "/") #Search the Reciprocal ListRec <- Reciprocal$Cross[Reciprocal$Cross %in% Reciprocal$Reciprocal] #Change the names of the reciprocal crosses ThreeYearPheno$Line[ThreeYearPheno$Line == "LH162/B47"] <- "B47/LH162" ThreeYearPheno$Line[ThreeYearPheno$Line == "LH185/B47"] <- "B47/LH185" ThreeYearPheno$Line[ThreeYearPheno$Line == "B47/LH38"] <- "LH38/B47" ThreeYearPheno$Line[ThreeYearPheno$Line == "B47/LH51"] <- "LH51/B47" ThreeYearPheno$Line[ThreeYearPheno$Line == "B47/LH82"] <- "LH82/B47" ThreeYearPheno$Line[ThreeYearPheno$Line == "B47/PHG29"] <- "PHG29/B47" ThreeYearPheno$Line[ThreeYearPheno$Line == "B47/PHG47"] <- "PHG47/B47" ThreeYearPheno$Line[ThreeYearPheno$Line == "B47/PHG83"] <- "PHG83/B47" ThreeYearPheno$Line[ThreeYearPheno$Line == "B47/PHN82"] <- "PHN82/B47" ThreeYearPheno$Line[ThreeYearPheno$Line == "B47/PHR25"] <- "PHR25/B47" ThreeYearPheno$Line[ThreeYearPheno$Line == "PHZ51/B47"] <- "B47/PHZ51" ThreeYearPheno$Line[ThreeYearPheno$Line == "B47/Q381"] <- "Q381/B47" ThreeYearPheno$Line[ThreeYearPheno$Line == "CG60/CG102"] <- "CG102/CG60" ThreeYearPheno$Line[ThreeYearPheno$Line == "LH82/CG102"] <- "CG102/LH82" ThreeYearPheno$Line[ThreeYearPheno$Line == "CGR03/CG108"] <- "CG108/CGR03" ThreeYearPheno$Line[ThreeYearPheno$Line == "CGR03/CG44"] <- "CG44/CGR03" ThreeYearPheno$Line[ThreeYearPheno$Line == "LH162/CG60"] <- "CG60/LH162" ThreeYearPheno$Line[ThreeYearPheno$Line == "LH82/CG60"] <- "CG60/LH82" ThreeYearPheno$Line[ThreeYearPheno$Line == "LH195/CGR01"] <- "CGR01/LH195" ThreeYearPheno$Line[ThreeYearPheno$Line == "LH198/CGR01"] <- "CGR01/LH198" ThreeYearPheno$Line[ThreeYearPheno$Line == "LH82/CGR01"] <- "CGR01/LH82" ThreeYearPheno$Line[ThreeYearPheno$Line == "LH198/LH185"] <- "LH185/LH198" ThreeYearPheno$Line[ThreeYearPheno$Line == "LH195/LH38"] <- "LH38/LH195" ThreeYearPheno$Line[ThreeYearPheno$Line == "LH195/LH51"] <- "LH51/LH195" ThreeYearPheno$Line[ThreeYearPheno$Line == "LH145/LH82"] <- "LH82/LH145" ThreeYearPheno$Line[ThreeYearPheno$Line == "LH195/LH82"] <- "LH82/LH195" ThreeYearPheno$Line[ThreeYearPheno$Line == "LH198/LH82"] <- "LH82/LH198" ThreeYearPheno$Line[ThreeYearPheno$Line == "PHZ51/PHRE1"] <- "PHRE1/PHZ51" ThreeYearPheno$Line[ThreeYearPheno$Line == "PHZ51/LH145"] <- "LH145/PHZ51" ThreeYearPheno$Line[ThreeYearPheno$Line == "PHZ51/LH195"] <- "LH195/PHZ51" ThreeYearPheno$Line[ThreeYearPheno$Line == "PHZ51/PB80"] <- "PB80/PHZ51" #Write out the corrected hybrid phenotype table #write.csv(ThreeYearPheno, "C:/Users/arroger4/Google Drive/Anna/G2F Phenotype Data/ThreeYearPheno_RecipRem.csv") ``` #Between this needed to compute yield values for MNH1_2016 and also change weird Z names to normal Z names. ```{r} #Read in Three Year Phenotype ThreeYrPheno <- read.csv("G:/My Drive/Anna/G2F Phenotype Data/ThreeYearPheno_RecipRem_MNSCHImputed.csv", header = TRUE) ThreeYearPheno <- ThreeYrPheno[,-1] #Remove all local checks #1) Read in Hybrid Names HybridParents <- read.csv("G:/My Drive/Anna/G2FHybrid genotypes/HybridParents_2014_2016.txt", sep = "\t", header = TRUE) #2) MAke the Hybrid NAmes concatenated HybridParents$Concat <- paste(HybridParents$Parent1,HybridParents$Parent2, sep = "/") head(HybridParents) #Get only the individuals we have genotype information on: #Read in a data file to pick up the names, we'll read in the A matrix since it's pretty small. A <- read.table("G:/My Drive/Anna/G2F_Hybrid_Genomic_Data_To_Analysis/G2FHybrids_AdditiveRelationshipMatrix.txt", header = FALSE, skip = 3) #Pick out the hybrid parent names from A. A[1:10,1:10] HybridsWGenos <- A$V1 head(HybridsWGenos) ThreeYearPhenoRR <- ThreeYearPheno[ThreeYearPheno$Line %in% HybridsWGenos, ] ThreeYearPhenoRR$Line <- as.character(ThreeYearPhenoRR$Line) ThreeYearPhenoRR$Line <- as.factor(ThreeYearPhenoRR$Line) ThreeYearPhenoHybsOnly <- ThreeYearPheno[ThreeYearPheno$Line %in% HybridsWGenos,] #Remove observations that were filler, plot is said to be discarded, or grain yield = zero ThreeYearPhenoHybsOnly$Yield[ThreeYearPhenoHybsOnly$Filler == "Filler"] <- NA ThreeYearPhenoHybsOnly$Yield[ThreeYearPhenoHybsOnly$Filler == "No seed"] <- NA ThreeYearPhenoHybsOnly$Yield[ThreeYearPhenoHybsOnly$Filler == "Alt source subbed"] <- NA ThreeYearPhenoHybsOnly$Yield[ThreeYearPhenoHybsOnly$Filler == "Planting error"] <- NA ThreeYearPhenoHybsOnly$Yield[ThreeYearPhenoHybsOnly$Filler == "Questionable"] <- NA ThreeYearPhenoHybsOnly$Yield[ThreeYearPhenoHybsOnly$Filler == "Yes"] <- NA ThreeYearPhenoHybsOnly$Yield[ThreeYearPhenoHybsOnly$Filler == "No Seed-sub label printed"] <- NA ThreeYearPhenoHybsOnly$Yield[ThreeYearPhenoHybsOnly$Filler == "No seed-sub made"] <- NA ThreeYearPhenoHybsOnly$Yield[ThreeYearPhenoHybsOnly$Plot.Discarded == "Yes"] <- NA ThreeYearPhenoHybsOnly$Line[ThreeYearPhenoHybsOnly$Comments == "Replaced by B73/Mo17"] <- "B73/MO17" ThreeYearPhenoHybsOnly$Yield[ThreeYearPhenoHybsOnly$Yield == 0] <- NA #Do we want ones with less than a certain threshold discarded? #Subset only lines that have a yield value ThreeYearPhenoHybsOnly_Yield <- ThreeYearPhenoHybsOnly[!(is.na(ThreeYearPhenoHybsOnly$Yield)),] ThreeYearPhenoHybsOnly_Yield$Line <- as.character(ThreeYearPhenoHybsOnly_Yield$Line) ThreeYearPhenoHybsOnly_Yield$Line <- as.factor(ThreeYearPhenoHybsOnly_Yield$Line) str(ThreeYearPhenoHybsOnly_Yield) #Down to 1921 unique hybrid lines unique(HybridsWGenos[!(HybridsWGenos %in% ThreeYearPhenoHybsOnly$Line)]) #One line was not in Pheno File GEMN-0260/B47 unique(ThreeYearPheno$Line[!(ThreeYearPheno$Line %in% HybridsWGenos)]) #Local checks and a nmber of teh phenotyped lines do not have genotype information unique(ThreeYearPhenoRR$Line[!(ThreeYearPhenoRR$Line %in% ThreeYearPhenoHybsOnly_Yield$Line)]) #These hybrids were eliminnated during the step where we got rid of things with Yield = NA: [1] CG123/PHHV4, M0125/CG102, MBNIL_B107/3IIH6, MBNIL_B144/PHZ51, PHW52/PHN37, we will need to remove these from teh A matrix and D matrix, along with removing GEMn-0260/B47. #Output subset of all hybrids with Yield Value write.csv(ThreeYearPhenoHybsOnly_Yield,"C:/Users/spiro/Google Drive/Anna/G2F Phenotype Data/ThreeYearPheno_HybsOnly_Filtered_05012018.csv", row.names = FALSE) ``` #Need to match all pedigree names to proper naming schemes -> in excel since GREP won't handle the weird mismatches, takes just as long to code as to check by hand. Doing it by hand so I can ensure data integrity ```{r} ThreeYearPheno <- read.csv("G:/My Drive/Anna/G2F Phenotype Data/ThreeYearPheno_HybsOnly_Filtered_05012018.csv", header = TRUE) ThreeYearPheno <- ThreeYearPhenoHybsOnly_Yield #Currently at 33605 observations, soon to be less..... #Put in the stand counts for Nebraska 1 and 4 for 2016. According to Oscar, stand count was almost perfect, thus we will go with 70 count stand for 2016 in these locations. 76 seeds were planted per plot. ThreeYearPheno$Stand.Count..plants.[ThreeYearPheno$Experiment == "NEH1" & ThreeYearPheno$Year == 2015] <- 70 ThreeYearPheno$Stand.Count..plants.[ThreeYearPheno$Experiment == "NEH4" & ThreeYearPheno$Year == 2015] <- 70 ThreeYearPheno$Stand.Count..plants.[ThreeYearPheno$Experiment == "MNH1" & ThreeYearPheno$Year == 2016] <- 70 #Read in the Genotype data to extract the hybrid names #HybGenos <- read.table("C:/Users/arroger4/Google Drive/Anna/G2F_Hybrid_Genomic_Data_To_Analysis/G2FHybrids_NumericGenos.txt", header = TRUE) #Hybnames <- HybGenos$Marker #write.csv(Hybnames, "C:/Users/arroger4/Google Drive/Anna/G2F_Hybrid_Genomic_Data_To_Analysis/HybNames.csv", row.names = FALSE) str(ThreeYearPheno) summary(ThreeYearPheno) ``` #Only after these things are done will we be able to do the analysis for venn comparisons, along with population structure, factor analysis for weather files #Do filtering for standcount and perform checks as Jim did with phenotype data ```{r} names(ThreeYearPheno) #What are the column names? #Adjust the column names names(ThreeYearPheno) <- c("RecId", "Experiment", "State", "City", "Code", "Source", "Line", "Experiment_Code", "Heterotic.Pool", "Rep", "Block", "Plot", "Range", "Pass", "Plot.area", "Date.Planted", "Date.Harvested", "Anthesis..date.", "Silking..date.", "Stand.Count..plants.", "Pollen.DAP..days.", "Silk.DAP..days.", "Plant.height..cm.", "Ear.height..cm.", "Root.lodging..plants.", "Stalk.lodging..plants.", "Grain.Moisture..percent.", "Test.Weight..lbs.bu.", "Plot.Weight..lbs.", "Yield", "Plot.Discarded", "Filler", "Comments", "Year" ) ThreeYearPheno$Stand_per_area <- ThreeYearPheno$Stand.Count..plants. / ThreeYearPheno$Plot.area ThreeYearPheno$Year <- as.factor(ThreeYearPheno$Year) ThreeYearPheno$Code <- as.factor(ThreeYearPheno$Code) ThreeYearPheno$Rep <- as.factor(ThreeYearPheno$Rep) ThreeYearPheno$Block <- as.factor(ThreeYearPheno$Block) ThreeYearPheno$Plot <- as.numeric(ThreeYearPheno$Plot) ThreeYearPheno$Heterotic.Pool <- as.factor(ThreeYearPheno$Heterotic.Pool) ThreeYearPheno$Range <- as.factor(ThreeYearPheno$Range) ThreeYearPheno$Pass <- as.factor(ThreeYearPheno$Pass) #According to Jim, label block as block 1 for 2016, fitting no block effects in that year. table(is.na(ThreeYearPheno$Block)) ThreeYearPheno$Block[is.na(ThreeYearPheno$Block)] <- 1 ``` ```{r} library(magrittr) library(dplyr) env_means_stand = group_by(ThreeYearPheno, Year, Experiment) %>% summarise(mean(Stand_per_area, na.rm = T)) names(env_means_stand)[3] = 'Stand_per_area_mn' str(env_means_stand) ``` ```{r} ThreeYearPheno = merge(ThreeYearPheno, env_means_stand, by = c("Year", "Experiment")) ThreeYearPheno = mutate(ThreeYearPheno, stand_lin_cov = Stand_per_area - Stand_per_area_mn, stand_quad_cov = stand_lin_cov^2, Env = paste(Experiment, Year, sep = "_")) str(ThreeYearPheno) ``` ```{r} for (var in c("Year", "Experiment_Code", "Block", "Rep", "Hybrid", "Pass", "Range")){ print(paste("Missing values for", var)) print(nrow(ThreeYearPheno[is.na(ThreeYearPheno[[var]]),])) if (nrow(ThreeYearPheno[is.na(ThreeYearPheno[[var]]),]) > 0){ print("Values Missing in These Environs:") print(unique(ThreeYearPheno[is.na(ThreeYearPheno[[var]]),"Env"]))} } ``` ```{r} #Put in the proper reps and blocks for TX and GA for 2015 ThreeYearPheno <- mutate(ThreeYearPheno, Rep = ifelse(Env == "GAH1_2015", ifelse(Plot < 251, 1, 2), Rep), Rep = ifelse(Env == "TXH1_2015", ifelse(Plot < 235, 1, 2), Rep), Block = ifelse(is.na(Block), 1, Block) ) ``` ```{r} #Distribution of yield by year yield.hist <- ggplot(ThreeYearPheno, aes(x = Yield)) + geom_histogram(position = 'identity', bins = 50) + facet_grid(facets = ~ Year) yield.hist ``` ```{r} #Check if any plots to be discarded have NA values any(ThreeYearPheno[!is.na(ThreeYearPheno$Yield),"Plot_Discarded"]) #returns NA sum(ThreeYearPheno[!is.na(ThreeYearPheno$Yield),"Plot_Discarded"], na.rm = T) ``` ```{r} #Check distribution of stand count stand.hist <- ggplot(ThreeYearPheno, aes(x = stand_lin_cov)) + geom_histogram(position = 'identity', bins = 50) + facet_grid(facets = ~ Year) stand.hist stand_area.hist = ggplot(ThreeYearPheno, aes(x = Stand_per_area)) + geom_histogram(position = 'identity', bins = 50) + facet_grid(facets = ~ Year) stand_area.hist ``` ```{r} #The units above are plants per ft^2. NC corn production guide says 19k plants per acre for low water soils, up to 29k plants per acre for good soils. Let's set threshold at 15k plants per acre. Convert that to plants per square foot: plants_acre = 15000 sq_ft_acre = 43560 threshold_stand_per_area = plants_acre/sq_ft_acre threshold_stand_per_area ThreeYearPheno = mutate(ThreeYearPheno, Yield = ifelse(Stand_per_area > threshold_stand_per_area & Stand_per_area < 1, Yield, NA)) year.mns = group_by(ThreeYearPheno, Year) %>% summarise(Yield = mean(Yield, na.rm = T)) yield.hist = ggplot(ThreeYearPheno, aes(x = Yield)) + geom_histogram(position = 'identity', bins = 50, color = 'blue') + facet_grid(facets = ~ Year) + xlab('Yield bu/A') + ggtitle('Distribution of yield within each year') + geom_vline(data = year.mns, aes(xintercept=Yield), color="red") + theme_bw() #ggsave(yield.hist, filename = paste0(outpath, "ThreeYearPheno yield histogram by year.png")) yield.hist ``` ```{r} #Censor yield data above 300 bu/A, seems very unlikely to be real. ThreeYearPheno = mutate(ThreeYearPheno, Yield = ifelse(Yield >= 300, NA, Yield)) #Plot the relationship between Yield and stand over all years yield.stand = ggplot(ThreeYearPheno %>% filter(!is.na(Yield) & !is.na(Stand_per_area)), aes(x = Stand_per_area, y = Yield)) + geom_point(size = 0.3, alpha = 0.2, colour = 'green') + xlab("Stand per are (plants/foot^2)") + ylab("Yield (bu/A") + ggtitle("Relationship between Yield and Stand") + theme_bw() #ggsave(yield.stand, filename = "G2F Yield-Stand relationships.png") yield.stand ``` Are there plots with Yield data that lack stand values? ```{r} nrow(ThreeYearPheno[!is.na(ThreeYearPheno$Yield) & is.na(ThreeYearPheno$Stand_per_area),]) ``` GOOD! Some environments have no observations for yield, cause the het error variance models to fail: 2.22.2018 Still no stand count information --> calling all of these as 70 stand. ```{r} nrow(ThreeYearPheno[ThreeYearPheno$Env == "NEH1_2015" & !is.na(ThreeYearPheno$Yield),]) #Look to see which plots form NEH1_2015 do not have NA yield. head(ThreeYearPheno[ThreeYearPheno$Env == "NEH1_2015",]) ``` ```{r} nrow(ThreeYearPheno[ThreeYearPheno$Env == "NEH4_2015" & !is.na(ThreeYearPheno$Yield),]) ``` Finally, convert yield, Stand, and test weights to Metric units. Bushels are computed assuming 56 lbs/bushel test weight: pounds per acre = 56*bu/A Mg per acre = 0.000453592(Mg/pound)*(pounds/acre) Mg per ha = 2.4710559990832(acre/ha)*(Mg/acre) test weight pounds per bushel to kg per cubic m kg per bushel = 0.453592(kg/pound) * (pound/bu) kg per m^3 = 28.3776(bushel/cu^3) * (kg/bu) kg per m^3 = 12.8719pound per bushel stand plants per acre plants per ha = (2.4710559990832)acre/ha * plants/acre ```{r} yield.conversion = 56*0.000453592*2.4710559990832 twt.conversion = 0.453592*28.3776 std.conversion = 2.4710559990832 ThreeYearPheno = mutate(ThreeYearPheno, Yield_Mg_ha = Yield*yield.conversion, Twt_kg_m3 = twt.conversion*Test.Weight..lbs.bu., Stand_per_ha = std.conversion*Stand_per_area) ``` #Remove the Observations that have been set to NA for yield. ```{r} table(is.na(ThreeYearPheno$Yield)) #33605 observations, 1094 of them are NA for yield ThreeYearPheno <- ThreeYearPheno[!is.na(ThreeYearPheno$Yield),] summary(ThreeYearPheno) table(ThreeYearPheno$Env) ThreeYearPheno$Env <- as.character(ThreeYearPheno$Env) ThreeYearPheno$Env[ThreeYearPheno$Env == "KSH3_2016"] <- "KSH2_2016" ThreeYearPheno$Env[ThreeYearPheno$Env == "IAH1a_2014"] <- "IAH1ab_2014" ThreeYearPheno$Env[ThreeYearPheno$Env == "IAH1b_2014"] <- "IAH1ab_2014" ThreeYearPheno$Env[ThreeYearPheno$Env == "MOH1_2015"] <- "MOH1_MOH2_2015" ThreeYearPheno$Env[ThreeYearPheno$Env == "MOH2_2015"] <- "MOH1_MOH2_2015" ThreeYearPheno$Env[ThreeYearPheno$Env == "NEH1_2015"] <- "NEH1_NEH4_2015" ThreeYearPheno$Env[ThreeYearPheno$Env == "NEH4_2015"] <- "NEH1_NEH4_2015" table(ThreeYearPheno$Env) ThreeYearPheno$Env <- as.factor(ThreeYearPheno$Env) ``` ```{r} ThreeYearPheno$Root.lodging..plants. = ifelse(is.na(ThreeYearPheno$Root.lodging..plants.), 0, ThreeYearPheno$Root.lodging..plants.) ThreeYearPheno$Stalk.lodging..plants. = ifelse(is.na(ThreeYearPheno$Stalk.lodging..plants.), 0, ThreeYearPheno$Stalk.lodging..plants.) ThreeYearPheno = ThreeYearPheno %>% mutate(Lodging = (Root.lodging..plants. + Stalk.lodging..plants.)/Stand.Count..plants., Lodging = ifelse(Lodging > 1, 1, Lodging), Comments = gsub(",", ".", Comments)) #get rid of commas, they mess up reading into csvs summary(ThreeYearPheno$Lodging) #look for outliers in the Silking and Pollen shed days plot(x = ThreeYearPheno$Pollen.DAP..days., y = ThreeYearPheno$Silk.DAP..days., ylab = "Silk DAP days", xlab = "Pollen DAP days") summary(ThreeYearPheno$Silk.DAP..days.) #Identify the outlier value in Silking Days ThreeYearPheno$Silk.DAP..days.[ThreeYearPheno$Silk.DAP..days. == 24] <- NA #Remove the really low value for silk days -24 days. Other outliers are 40 days to pollen shed, but isn't unheard of. 40 days to silk, wondering what that one is. -two observations from ARH2_2016 (jonesboro) ``` ```{r} #Look for outliers in ear height and plant height Height.Plot <- ggplot(ThreeYearPheno, aes(x = ThreeYearPheno$Ear.height..cm., y = ThreeYearPheno$Plant.height..cm.)) + geom_point() Height.Plot ThreeYearPheno$GAH2_2016_EH <- ifelse(test = ThreeYearPheno$Env == "GAH2_2016", yes = ThreeYearPheno$Ear.height..cm., no = NA) ThreeYearPheno$GAH2_2016_PH <- ifelse(test = ThreeYearPheno$Env == "GAH2_2016", yes = ThreeYearPheno$Plant.height..cm., no = NA) Height.Plot + geom_point(aes(x = ThreeYearPheno$GAH2_2016_EH, y = ThreeYearPheno$GAH2_2016_PH, color = "Red")) #I see the outliers that Jim is talking about, but I kind of want to see what lines they are and see if there's a trend. --Turns out trend is that they are mainly from GAH2_2016. Removing P and E's from this location -not trustworthy. #Will also remove all P and E's were Ear Height > Plant Height because it's not possible for the ear to be above the plant. which(ThreeYearPheno$Ear.height..cm. > ThreeYearPheno$Plant.height..cm., arr.ind = TRUE) #There are 8 observations where this is True #Also picks up the places where there is missing data for P and/or E. Sub <- filter(ThreeYearPheno, ThreeYearPheno$Ear.height..cm. > ThreeYearPheno$Plant.height..cm., !is.na(ThreeYearPheno$Ear.height..cm.)) ThreeYearPheno$Plant.height..cm.[ThreeYearPheno$Plant.height..cm. < ThreeYearPheno$Ear.height..cm.] <- NA ThreeYearPheno$Ear.height..cm.[ThreeYearPheno$Plant.height..cm. < ThreeYearPheno$Ear.height..cm.] <- NA ThreeYearPheno$Ear.height..cm.[ThreeYearPheno$Env == "GAH2_2016"] <- NA ThreeYearPheno$Plant.height..cm.[ThreeYearPheno$Env == "GAH2_2016"] <- NA #Still some outliers, but nothing too bad. Partially tempted to remove ones where the plant height is only a few centimeters above the ear height <- removed, June 2018 Height.Plot + geom_abline(slope = 1, intercept = 0, color = "red") ThreeYearPheno$Ear_Plant_Height_Ratio <- ThreeYearPheno$Ear.height..cm./ThreeYearPheno$Plant.height..cm. hist(ThreeYearPheno$Ear_Plant_Height_Ratio, breaks = 20) ThreeYearPheno$EPIndi <- as.factor(ifelse(ThreeYearPheno$Ear_Plant_Height_Ratio >= 0.75 | ThreeYearPheno$Ear_Plant_Height_Ratio <= 0.25, 1, 0)) Height.Plot + geom_point(aes(color = ThreeYearPheno$EPIndi)) + scale_color_manual(values = c("black", "red")) table(ThreeYearPheno$EPIndi) table(is.na(ThreeYearPheno$Ear.height..cm.)) table(is.na(ThreeYearPheno$Plant.height..cm.)) ThreeYearPheno$Plant.height..cm.[ThreeYearPheno$Ear_Plant_Height_Ratio >= 0.75 | ThreeYearPheno$Ear_Plant_Height_Ratio <= 0.25] <- NA ThreeYearPheno$Ear.height..cm.[ThreeYearPheno$Ear_Plant_Height_Ratio >= 0.75 | ThreeYearPheno$Ear_Plant_Height_Ratio <= 0.25] <- NA table(ThreeYearPheno$EPIndi) table(is.na(ThreeYearPheno$Ear.height..cm.)) table(is.na(ThreeYearPheno$Plant.height..cm.)) ``` ```{r} #do QC on stand counts, I have noticed some issues (stand.plot = ggplot(ThreeYearPheno, aes(x = Env, y = Stand.Count..plants.)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ) #identify outliers as 2.5*IQR and/or as above/below 30% and plot them outliers = group_by(ThreeYearPheno, Env) %>% summarise(Mean.std = mean(Stand.Count..plants.), IQR.std = IQR(Stand.Count..plants.)) %>% mutate(top.iqr = Mean.std + 2.5*IQR.std, bot.iqr = Mean.std - 2.5*IQR.std, top.perc = Mean.std*1.3, bot.perc = Mean.std*0.7) outliers.iqr = merge(outliers, ThreeYearPheno, by = "Env", all = T) %>% filter(Stand.Count..plants. < bot.iqr | Stand.Count..plants. > top.iqr) outliers.perc = merge(outliers, ThreeYearPheno, by = "Env", all = T) %>% filter(Stand.Count..plants. < bot.perc | Stand.Count..plants. > top.perc) outliers.both = rbind(outliers.iqr, outliers.perc) %>% filter(duplicated(.)) #keep only rows that are duplicated in both! (stand.plot + geom_point(data = outliers.iqr, colour = "red") + ggtitle("Outliers based on 2.5 IQR")) (stand.plot + geom_point(data = outliers.perc, colour = "green") + ggtitle("Outliers based on percentage of mean")) (stand.plot + geom_point(data = outliers.both, colour = "magenta") + ggtitle("Outliers based on percentage of mean AND IQR")) stand.plot #Remove all of the rows where the stand count is an outlier. -296 observations -> 32210 observations remain ThreeYearPheno <- ThreeYearPheno[!(ThreeYearPheno$RecId %in% outliers.both$RecId),] year.mns = group_by(ThreeYearPheno, Year) %>% summarise(Yield = mean(Yield, na.rm = T)) yield.hist = ggplot(ThreeYearPheno, aes(x = Yield)) + geom_histogram(position = 'identity', bins = 50, color = 'blue') + facet_grid(facets = ~ Year) + xlab('Yield bu/A') + ggtitle('Distribution of yield within each year') + geom_vline(data = year.mns, aes(xintercept=Yield), color="red") + theme_bw() yield.hist (yield.plot = ggplot(ThreeYearPheno, aes(x = Env, y = Yield_Mg_ha)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ) yield.plot outliers = group_by(ThreeYearPheno, Env) %>% summarise(Mean.std = mean(Yield_Mg_ha), IQR.std = IQR(Yield_Mg_ha)) %>% mutate(top.iqr = Mean.std + 2.5*IQR.std, bot.iqr = Mean.std - 2.5*IQR.std, top.perc = Mean.std*1.3, bot.perc = Mean.std*0.7) outliers.iqr = merge(outliers, ThreeYearPheno, by = "Env", all = T) %>% filter(Yield_Mg_ha < bot.iqr | Yield_Mg_ha > top.iqr) outliers.perc = merge(outliers, ThreeYearPheno, by = "Env", all = T) %>% filter(Yield_Mg_ha < bot.perc | Yield_Mg_ha > top.perc) outliers.both = rbind(outliers.iqr, outliers.perc) %>% filter(duplicated(.)) #keep only rows that are duplicated in both! (yield.plot + geom_point(data = outliers.iqr, colour = "red") + ggtitle("Outliers based on 2.5 IQR")) (yield.plot + geom_point(data = outliers.perc, colour = "green") + ggtitle("Outliers based on percentage of mean")) (yield.plot + geom_point(data = outliers.both, colour = "magenta") + ggtitle("Outliers based on percentage of mean AND IQR")) #Sanity check with earlier email to Jim -changed filtering order ever so slightly, so the super high yields were already gone (8 points -email from 6-19-2018). #Moving forward using IQR to filter -Percentile is too stringent for yield. Will remove 115 observations (magenta/red obs from graphs, +/- 2.5 IQR) ThreeYearPheno <- ThreeYearPheno[!(ThreeYearPheno$RecId %in% outliers.iqr$RecId),] ``` ```{r} #Remake the boxplot with the new yield filtering. Send to Jim (yield.plot_new = ggplot(ThreeYearPheno, aes(x = Env, y = Yield_Mg_ha)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ) yield.plot_new ``` #output the Phenotype Data, 32095 observations -All have yield values. Heck yeah. ```{r} #Remove some of the extra variables used during filtering ThreeYearPheno <- ThreeYearPheno[, -c(44:47)] #Drop ARH2 as directed by Jim. Yields are horrible and likely unreliable. Down to 31787 Observations ThreeYearPheno <- ThreeYearPheno[!(ThreeYearPheno$Env == "ARH2_2016"),] write.csv(ThreeYearPheno, file = "G:/My Drive/Anna/G2F Phenotype Data/G2F_FilteredForStandCount_YieldToMetric_06192018.csv", row.names = FALSE, quote = FALSE) ThreeYearPheno <- read.csv("G:/My Drive/Anna/G2F Phenotype Data/G2F_FilteredForStandCount_YieldToMetric_06192018.csv", header = TRUE) #Make edit to reps in IAH1a/b so that each experiment has unique reps and blocks ThreeYearPheno$Rep[ThreeYearPheno$Experiment == "IAH1b" & ThreeYearPheno$Rep == 1] <- 3 ThreeYearPheno$Rep[ThreeYearPheno$Experiment == "IAH1b" & ThreeYearPheno$Rep == 2] <- 4 ThreeYearPheno$Block[ThreeYearPheno$Experiment == "IAH1b"] <- ThreeYearPheno$Block[ThreeYearPheno$Experiment == "IAH1b"] + 13 #Make similar edit for NEH1 and NEH4 in 2015 ThreeYearPheno$Rep[ThreeYearPheno$Experiment == "NEH4" & ThreeYearPheno$Year == 2015 & ThreeYearPheno$Rep == 1] <- 3 ThreeYearPheno$Rep[ThreeYearPheno$Experiment == "NEH4" & ThreeYearPheno$Year == 2015 & ThreeYearPheno$Rep == 2] <- 4 ThreeYearPheno$Block[ThreeYearPheno$Experiment == "NEH4" & ThreeYearPheno$Year == 2015] <- ThreeYearPheno$Block[ThreeYearPheno$Experiment == "NEH4" & ThreeYearPheno$Year == 2015] + 10 #Make a third, similar edit for MOH1 and MOH2 in 2015 ThreeYearPheno$Rep[ThreeYearPheno$Experiment == "MOH2" & ThreeYearPheno$Year == 2015 & ThreeYearPheno$Rep == 1] <- 3 ThreeYearPheno$Rep[ThreeYearPheno$Experiment == "MOH2" & ThreeYearPheno$Year == 2015 & ThreeYearPheno$Rep == 2] <- 4 ThreeYearPheno$Block[ThreeYearPheno$Experiment == "MOH2" & ThreeYearPheno$Year == 2015] <- ThreeYearPheno$Block[ThreeYearPheno$Experiment == "MOH2" & ThreeYearPheno$Year == 2015] + 10 write.csv(ThreeYearPheno, file = "G:/My Drive/Anna/G2F Phenotype Data/G2F_FilteredForStandCount_YieldToMetric_07162018.csv", row.names = FALSE, quote = FALSE) ``` A Few more problems have been noted, specifically with DEH1_2014 ```{r} ThreeYearPheno <- read.csv(file = "G:/My Drive/Anna/G2F Phenotype Data/G2F_FilteredForStandCount_YieldToMetric_07162018.csv", header = TRUE) #Bring in the Raw DEH data from RAndy DE_2014 <- read.csv(file = "G:/My Drive/Anna/G2F Phenotype Data/G2FDE.csv", header = TRUE) str(DE_2014) #PWT = plot Weight #GMT = Grain Moisture #TWT = Test weight #We need to essentially redo prior filtering with DEH1_2014 on its own. #First let's get all the columns to be the right names names(ThreeYearPheno) names(DE_2014) names(DE_2014) <- c("State", "Experiment", "Entry", "Code", "Source", "Pedigree", "Alias", "Seed", "Rep", "Plot", "Discarded", "Filler", "Stand.Count..plants", "Plot.Weight..lbs", "Grain.Moisture..percent", "Yield", "Test.Weight..lbs.bu.", "Stalk.lodging..plants.", "Root.lodging..plants.", "DMF", "DFF", "Ear.height..cm.", "Plant.Height..cm.", "Comments") DE_2014$State <- as.character(DE_2014$State) DE_2014$State <- "DE" DE_2014$Experiment <- as.character("DEH1_2014") DE_2014$Year <- 2014 DE_2014$Pedigree <- toupper(as.character(DE_2014$Pedigree)) str(DE_2014) DE_2014 <- DE_2014[!(DE_2014$Discarded == "Yes") | DE_2014$Comment == "" | DE_2014$Filler == "",] #Remove plots indicated by Randy for discard. DE_2014$Stand_per_area ``` Start Making the Z Matrix ```{r} #Make the design matrix to pair with an A, D or M matrix Z <- ThreeYearPheno[,c(1,2,8,11)] head(Z) #Z <- Z[order(Z$Line),] #Used as a check during workout programming to see if the zeroes and ones align Z <- model.matrix(~ 0 + Line, data = Z) attr(Z, which = "Hybrid") <- ThreeYearPheno$Line attr(Z, which = "Experiment") <- ThreeYearPheno$Experiment attr(Z, which = "Year") <- ThreeYearPheno$Year attr(Z, which = "Experiment_Year") <- ThreeYearPheno$Env #Output the Line Design Matrix write.table(Z, "C:/Users/Spiro/Google Drive/Anna/G2F_Hybrid_Genomic_Data_To_Analysis/ZMatrix.txt", row.names = FALSE) #Make the Z matrix for environment Z.E <- ThreeYearPheno [,c(1,2,8,11,39)] Z.E <- model.matrix(~ 0 + Env, data = Z.E) attr(Z.E, which = "Hybrid") <- ThreeYearPheno$Line attr(Z.E, which = "Experiment") <- ThreeYearPheno$Experiment attr(Z.E, which = "Year") <- ThreeYearPheno$Year attr(Z.E, which = "Experiment_Year") <- ThreeYearPheno$Env #Output the Environmental Design Matrix write.table(Z.E, "C:/Users/Spiro/Google Drive/Anna/G2F_Hybrid_Genomic_Data_To_Analysis/ZMatrix_Environment.txt", row.names = FALSE) #Need more memory allocation to make this design matrix... Z.GxE <- ThreeYearPheno [,c(1,2,8,11,39)] Z.GxE <- model.matrix(~ 0 + Line*Env, data = Z.GxE) attr(Z.GxE, which = "Hybrid") <- ThreeYearPheno$Line attr(Z.GxE, which = "Experiment") <- ThreeYearPheno$Experiment attr(Z.GxE, which = "Year") <- ThreeYearPheno$Year attr(Z.GxE, which = "Experiment_Year") <- ThreeYearPheno$Env write.table(Z.GxE, "C:/Users/Spiro/Google Drive/Anna/G2F_Hybrid_Genomic_Data_To_Analysis/ZMatrix_GxETerm.txt", row.names = FALSE) #Code the Z.GxE matrix and Z.E matrix summary(Z[,1]) ```