#Beet Geosmin Phenotypic Analysis #Chr 8 is associated with geosmin in beet #Submitted to G3 -- July 2021 #Solveig Hanson, Julie Dawson, Irwin Goldman # INSTALL PACKAGES------------------------------------- install.packages('outliers') install.packages('pastecs') install.packages("dplyr", dependencies = TRUE) install.packages("ggplot2", dependencies=TRUE) install.packages('lme4', dependencies = TRUE) install.packages('emmeans', dependencies=TRUE) install.packages('lattice') install.packages('car') library(outliers) library(pastecs) library(dplyr) library(ggplot2) library(lme4) library(lmerTest) library(emmeans) library(lattice) library(car) library(tidyr) library(grid) library(gridExtra) ##LOAD DATA - FULL POPULATION ------------------------------------------------- #load data set with Block (A and B in each year) and Block2 (A-D) columns Genomics_Geo <- read.csv("S2_Genomics_Geosmin_Master_2.csv", header = T) # basic data examination str(Genomics_Geo) # change class of vectors Genomics_Geo$Root <- as.factor(Genomics_Geo$Root) Genomics_Geo$Year <- as.factor(Genomics_Geo$Year) Genomics_Geo$Family <- as.factor(Genomics_Geo$Family) Genomics_Geo$Row <- as.factor(Genomics_Geo$Row) Genomics_Geo$Block <- as.factor(Genomics_Geo$Block) Genomics_Geo$Block2 <- as.factor(Genomics_Geo$Block2) # confirm changes str(Genomics_Geo) ############DATA SUBSET CREATION - CHECKS##################-------------------- Checks <- Genomics_Geo %>% filter(Check=="C") %>% droplevels() str(Checks) TG <- Checks %>% filter(Family=="TG") %>% droplevels() Chiog <- Checks %>% filter(Family=="Chiog ") %>% droplevels() ###########DATA SUBSET CREATION - F3 INDIVIDUALS################################ ##F3 Individuals by Family & Year F3 <- Genomics_Geo %>% filter(Check=="") %>% droplevels() str(F3) summary(F3) #569 roots total #F3 Individuals with roots present in both years F3_2yr <- F3 %>% filter(!Family %in% c("4042", "4158", "4165")) %>% dplyr::select(Year:Ln_Geosmin) str(F3_2yr) summary(F3_2yr) #557 roots #no roots from excluded families were selected for genotyping ##F3 individuals from families with at least 2 roots per year F3_2yr_2per <- F3_2yr %>% filter(!Family %in% c("4008", "4140", "4146")) #542 roots str(F3_2yr_2per) summary(F3_2yr_2per) #4 roots from excluded families (2 from 4146, 2 from 4140) were selected for genotyping #in both cases, root with leverage was the only one from 2017 ##LOAD DATA - SELECTED INDIVIDUALS ----------------------------------------------- #load data set Genomics_Sel <- read.csv("S3_pheno_geo_v3.csv", header = T) # basic data examination # str shows the structure of a data set str(Genomics_Sel) # change class of vectors Genomics_Sel$Fam <- as.factor(Genomics_Sel$Fam) Genomics_Sel$Year <- as.factor(Genomics_Sel$Year) # confirm changes str(Genomics_Sel) #Individuals per family per year Indiv_Fam_Yr_Sel <- table(Genomics_Sel$Fam, Genomics_Sel$Year) write.csv(Indiv_Fam_Yr_Sel, file = "Sel Indiv Table.csv", row.names=TRUE) #individuals per tail Genomics_Sel %>% group_by(Tail) %>% dplyr::count() #individuals per tail and year Genomics_Sel %>% group_by(Tail, Year) %>% dplyr::count() #Individuals per family Indiv_Fam_Sel <- Genomics_Sel %>% group_by(Fam) %>% dplyr::count() #1 to 6 individuals per family table(Indiv_Fam_Sel$n) write.csv(Indiv_Fam_Sel, file = "Sel Indiv Table Total.csv", row.names=TRUE) Genomics_Sel %>% summarise(n_distinct(Fam)) #24 families #Individuals per tail per year Indiv_Tail_Yr_Sel <- table(Genomics_Sel$Tail, Genomics_Sel$Year) write.csv(Indiv_Tail_Yr_Sel, file = "Sel Indiv Tails.csv", row.names=TRUE) #mean geosmin by selected family Sel_Geo <- aggregate(Indiv_Geo ~ Fam, data=Genomics_Sel, mean) Sel_Geo$Indiv_Ln_Geo <- log(Sel_Geo$Indiv_Geo) write.csv(Sel_Geo, file = "Sel Indiv Mean Geo by Fam.csv", row.names=TRUE) #G3 FIGURE 1 - F3 INDIVIDUAL HISTOGRAMS ------------------------------------------------------------- #microgram symbol example g_mu <- bquote("Geosmin concentration"~"("*mu*"g"%.%"kg" ^-1*")") ln_g_mu <- bquote("Ln geosmin concentration"~"(ln"~mu*"g"%.%"kg" ^-1*")") #G3 FIGURE 1A - FULL POPULATION - UNTRANSFORMED INDIV ROOT GEOSMIN #find year means yr_mean_geo <- F3_2yr_2per %>% group_by(Year) %>% summarise(grp.mean=mean(Geosmin)) #histogram - geosmin by year hist_geo <- ggplot(F3_2yr_2per, aes(x=Geosmin, color=Year, fill=Year)) + ylim(0,40)+ geom_histogram(binwidth=1.5, alpha = 0.5, position="identity")+ geom_vline(data=yr_mean_geo, aes(xintercept=grp.mean, color=Year), linetype="dashed", size=1, show.legend=F) + scale_color_brewer(palette="Accent")+ scale_fill_brewer(palette="Accent")+ geom_vline(xintercept = 9.49, linetype="dashed", color="red", size=1)+ geom_vline(xintercept = 8.02, linetype="dashed", color="orange", size=1)+ labs(title="(A)",x=g_mu, y = "Count")+ annotate(geom="text", x=7, y=35, label="8.02", color="orange", size=3.5)+ annotate(geom="text", x=8.8, y=36.5, label="9.49", color="red", size=3.5)+ annotate(geom="text", x=11, y=38, label="10.08", color="dark green", size=3.5)+ annotate(geom="text", x=12, y=39.8, label="10.72", color="purple", size=3.5)+ theme(legend.position = "none") hist_geo #FIGURE 1B - SELECTED POPULATIONS - UNTRANSFORMED INDIV ROOT GEOSMIN #find tail means yr_mean_geo_sel <- Genomics_Sel %>% group_by(Tail) %>% summarise(grp.mean=mean(Indiv_Geo)) #histogram - geosmin bytail hist_geo_sel <- ggplot(Genomics_Sel, aes(x=Indiv_Geo, color=Tail, fill=Tail)) + ylim(0,25)+ geom_histogram(binwidth=1.5, alpha = 0.5, position="identity")+ geom_vline(data=yr_mean_geo_sel, aes(xintercept=grp.mean, color=Tail), linetype="dashed", size=1, show.legend=F) + scale_color_brewer(palette="Accent")+ scale_fill_brewer(palette="Accent")+ geom_vline(xintercept = 9.49, linetype="dashed", color="red",size=1)+ geom_vline(xintercept = 8.02, linetype="dashed", color="orange",size=1)+ labs(title="(B)",x=g_mu, y = "Count")+ annotate(geom="text", x=1.2, y=24, label="2.80", color="purple", size=3.5)+ annotate(geom="text", x=7, y=24, label="8.02", color="orange", size=3.5)+ annotate(geom="text", x=11, y=24, label="9.49", color="red", size=3.5)+ annotate(geom="text", x=24, y=24, label="22.60", color="dark green", size=3.5)+ theme(legend.position = "none") hist_geo_sel ------------------------ #G3 FIGURE 1A - FULL POPULATION - UNTRANSFORMED INDIV ROOT GEOSMIN #find year means yr_mean_geo <- F3_2yr_2per %>% group_by(Year) %>% summarise(grp.mean=mean(Geosmin)) #histogram - geosmin by year hist_geo <- ggplot(F3_2yr_2per, aes(x=Geosmin, color=Year, fill=Year)) + ylim(0,40)+ geom_histogram(binwidth=1.5, alpha = 0.5, position="identity")+ geom_vline(data=yr_mean_geo, aes(xintercept=grp.mean, color=Year), linetype="dashed", size=1, show.legend=F) + scale_color_brewer(palette="Accent")+ scale_fill_brewer(palette="Accent")+ geom_vline(xintercept = 9.49, linetype="dashed", color="red", size=1)+ geom_vline(xintercept = 8.02, linetype="dashed", color="orange", size=1)+ labs(title="(A)",x=g_mu, y = "Count")+ annotate(geom="text", x=7, y=35, label="8.02", color="orange", size=3.5)+ annotate(geom="text", x=8.8, y=36.5, label="9.49", color="red", size=3.5)+ annotate(geom="text", x=11, y=38, label="10.08", color="dark green", size=3.5)+ annotate(geom="text", x=12, y=39.8, label="10.72", color="purple", size=3.5)+ theme(legend.position = "none") hist_geo #FIGURE 1B - SELECTED POPULATIONS - UNTRANSFORMED INDIV ROOT GEOSMIN #find tail means yr_mean_geo_sel <- Genomics_Sel %>% group_by(Tail) %>% summarise(grp.mean=mean(Indiv_Geo)) #histogram - geosmin bytail hist_geo_sel <- ggplot(Genomics_Sel, aes(x=Indiv_Geo, color=Tail, fill=Tail)) + ylim(0,25)+ geom_histogram(binwidth=1.5, alpha = 0.5, position="identity")+ geom_vline(data=yr_mean_geo_sel, aes(xintercept=grp.mean, color=Tail), linetype="dashed", size=1, show.legend=F) + scale_color_brewer(palette="Accent")+ scale_fill_brewer(palette="Accent")+ geom_vline(xintercept = 9.49, linetype="dashed", color="red",size=1)+ geom_vline(xintercept = 8.02, linetype="dashed", color="orange",size=1)+ labs(title="(B)",x=g_mu, y = "Count")+ annotate(geom="text", x=1.2, y=24, label="2.80", color="purple", size=3.5)+ annotate(geom="text", x=7, y=24, label="8.02", color="orange", size=3.5)+ annotate(geom="text", x=11, y=24, label="9.49", color="red", size=3.5)+ annotate(geom="text", x=24, y=24, label="22.60", color="dark green", size=3.5)+ theme(legend.position = "none") hist_geo_sel #G3 FIGURE 1 - HISTOGRAMS OF FULL POP AND SELECTED INDIVIDUALS - UNTRANSFORMED GEOSMIN grid.arrange(hist_geo, hist_geo_sel, nrow=2, ncol=1) #########RCBD WITH SUBSAMPLING FOR ALL FAMILIES WITH 2 OR MORE SAMPLES IN BOTH YEARS################# ----------------------------- #####TABLE 1 G3 RESUBMISSION JULY 2021 #boxplot boxplot(Ln_Geosmin~Year, data=F3_2yr_2per) boxplot(Ln_Geosmin~Family, data=F3_2yr_2per) # scatterplot plot(F3_2yr_2per$Ln_Geosmin) #test most extreme value for outlier status grubbs.test(F3_2yr_2per$Ln_Geosmin, type=10) #RCBD mixed model with subsampling and block nested in year #Year/Block pools variation from Block:Year interaction lm1 <- lmer(Ln_Geosmin ~ Year/Block + Year + Family +Year:Family + (1|Block:Year:Family), data=F3_2yr_2per, na.action = na.exclude) print(summary(lm1),correlation=FALSE) anova(lm1) ranova(lm1) isSingular(lm1) #check model assumptions - heteroscedacicity #residuals vs fitted plot plot(lm1) ## box-plots of residuals by fam & year boxplot(residuals(lm1) ~ F3_2yr_2per$Family) boxplot(residuals(lm1) ~ F3_2yr_2per$Year) #levene's test leveneTest(Ln_Geosmin ~ Family, F3_2yr_2per, center=mean) leveneTest(Ln_Geosmin ~ Year, F3_2yr_2per, center=mean) #check model assumptions - influential points #from car package - overwrites lme4 influence functions influencePlot(lm1) #FOR SUPPLEMENTARY TABLE 1 #get estimated marginal means - Fam main effect emm1 <- emmeans(lm1, ~Family) summary(emm1) #pairwise comparison table using CLD paircomps.CLD <-multcomp::cld(emm1, alpha=0.05, sort=T, reversed=T, Letters=letters) print(paircomps.CLD) #MEAN GEOSMIN CONCENTRATION BY FAMILY - REVISED FOR F3_2yr_2per DATASET ------------------- #2017 and 2019 combined Fam_Mean_Geo <- data.frame(aggregate(F3_2yr_2per$Geosmin, by=list(F3_2yr_2per$Family), FUN=mean)) cnames_fam <- c("Family", "Mean_Geosmin") colnames(Fam_Mean_Geo) <- cnames_fam Fam_Mean_Geo$Mean_Geosmin <- round(Fam_Mean_Geo$Mean_Geosmin, 2) #2017 family means F3_2yr_2per_17 <- F3_2yr_2per %>% filter(Year==2017) Fam_Mean_Geo_17 <- data.frame(aggregate(F3_2yr_2per_17$Geosmin, by=list(F3_2yr_2per_17$Family), FUN=mean)) colnames(Fam_Mean_Geo_17) <- cnames_fam Fam_Mean_Geo_17$Mean_Geosmin <- round(Fam_Mean_Geo_17$Mean_Geosmin, 2) #2019 family means F3_2yr_2per_19 <- F3_2yr_2per %>% filter(Year==2019) Fam_Mean_Geo_19 <- data.frame(aggregate(F3_2yr_2per_19$Geosmin, by=list(F3_2yr_2per_19$Family), FUN=mean)) colnames(Fam_Mean_Geo_19) <- cnames_fam Fam_Mean_Geo_19$Mean_Geosmin <- round(Fam_Mean_Geo_19$Mean_Geosmin, 2) #add 2019 and 2017 mean geosmin to overall mean geosmin df Fam_Mean_Geo$Mean_Geo_2019 <- Fam_Mean_Geo_19$Mean_Geosmin[match(Fam_Mean_Geo$Family, Fam_Mean_Geo_19$Family)] Fam_Mean_Geo$Mean_Geo_2017 <- Fam_Mean_Geo_17$Mean_Geosmin[match(Fam_Mean_Geo$Family, Fam_Mean_Geo_17$Family)] cnames_means <- c("Family", "Mean_Geosmin", "Mean_Geosmin_2019", "Mean_Geosmin_2017") colnames(Fam_Mean_Geo) <- cnames_means #MEAN LN GEOSMIN CONCENTRATION BY FAMILY -- REVISED FOR F3_2yr_2per DATASET------------------------------------ Fam_Mean_Geo_Ln <- data.frame(aggregate(F3_2yr_2per$Ln_Geosmin, by=list(F3_2yr_2per$Family), FUN=mean)) cnames_fam_ln <- c("Family", "Mean_Ln_Geosmin") colnames(Fam_Mean_Geo_Ln) <- cnames_fam_ln Fam_Mean_Geo_Ln$Mean_Ln_Geosmin <- round(Fam_Mean_Geo_Ln$Mean_Ln_Geosmin, 2) #2017 family mean ln geosmin Fam_Mean_Geo_17_Ln <- data.frame(aggregate(F3_2yr_2per_17$Ln_Geosmin, by=list(F3_2yr_2per_17$Family), FUN=mean)) colnames(Fam_Mean_Geo_17_Ln) <- cnames_fam_ln Fam_Mean_Geo_17_Ln$Mean_Ln_Geosmin <- round(Fam_Mean_Geo_17_Ln$Mean_Ln_Geosmin, 2) #2019 family mean ln geosmin Fam_Mean_Geo_19_Ln <- data.frame(aggregate(F3_2yr_2per_19$Ln_Geosmin, by=list(F3_2yr_2per_19$Family), FUN=mean)) colnames(Fam_Mean_Geo_19_Ln) <- cnames_fam_ln Fam_Mean_Geo_19_Ln$Mean_Ln_Geosmin <- round(Fam_Mean_Geo_19_Ln$Mean_Ln_Geosmin, 2) #add 2019 and 2017 geosmin to overall mean geosmin df Fam_Mean_Geo_Ln$Mean_Ln_Geo_2019 <- Fam_Mean_Geo_19_Ln$Mean_Ln_Geosmin[match(Fam_Mean_Geo_Ln$Family, Fam_Mean_Geo_19_Ln$Family)] Fam_Mean_Geo_Ln$Mean_Ln_Geo_2017 <- Fam_Mean_Geo_17_Ln$Mean_Ln_Geosmin[match(Fam_Mean_Geo_Ln$Family, Fam_Mean_Geo_17_Ln$Family)] cnames_means_ln <- c("Family", "Mean_Ln_Geosmin", "Mean_Ln_Geosmin_2019", "Mean_Ln_Geosmin_2017") colnames(Fam_Mean_Geo_Ln) <- cnames_means_ln Fam_Mean_Geo <- merge(Fam_Mean_Geo, Fam_Mean_Geo_Ln, by="Family") write.csv(Fam_Mean_Geo, file = "Family_Mean_Geosmin.csv", row.names=TRUE) #UNTRANSFORMED GEOSMIN VALUES FOR CONSTRUCTION OF SUPPLEMENTARY TABLE 1 #T TESTS FOR F3 INDIVIDUAL MEANS AND F2 FAMILY MEANS BETWEEN YEARS--------------------- t.test(Geosmin ~ Year, data=F3_2yr_2per) t.test(Ln_Geosmin ~ Year, data=F3_2yr_2per) t.test(Mean_Geosmin ~ Year, data=F2_Means) t.test(Mean_Ln_Geosmin ~ Year, data=F2_Means_Ln) #T TESTS FOR CHECK MEANS VS F3 INDIVIDUAL MEANS-------------------- t.test(F3_2yr_2per$Ln_Geosmin, TG$Ln_Geosmin) t.test(F3_2yr_2per$Geosmin, TG$Geosmin) t.test(F3_2yr_2per$Ln_Geosmin, Chiog$Ln_Geosmin) t.test(F3_2yr_2per$Geosmin, Chiog$Geosmin) #T TESTS FOR TAIL MEANS---------------------------------------------- table(Genomics_Sel$Tail) t.test(Indiv_Geo ~ Tail, data=Genomics_Sel) t.test(Indiv_Ln_Geo ~ Tail, data=Genomics_Sel) t.test(Mean_Geosmin ~ Tail, data=Genomics_Sel) t.test(Mean_Ln_Geosmin ~ Tail, data=Genomics_Sel) #T tests for tail means vs relevant checks Low <-droplevels(dplyr::filter(Genomics_Sel, Tail == "Low")) High <-droplevels(dplyr::filter(Genomics_Sel, Tail == "High")) t.test(Low$Indiv_Ln_Geo, TG$Ln_Geosmin) t.test(High$Indiv_Ln_Geo, Chiog$Ln_Geosmin)