############################################## ########## TRA1 RNA SEQ ANALYSIS ############# ############## Matthew D. Berg ############### ############################################## #Packages needed to run this code library(tidyverse) library(compositions) library(zCompositions) library(pheatmap) library(RColorBrewer) library(ggsci) library(DESeq2) library(reshape2) library(stringr) ###################################################### ########## S.CEREVISIAE RNA SEQ ANALYSIS ############# ###################################################### # Read in data from featureCounts data <- read.table("tra1q3Readtable.txt", header = T, sep = "\t") GeneNames <- read.delim("YeastCommonSystematicNames.txt", header = T, sep = "\t") # Rename column headers to something more informative colnames(data) <- c("Systematic", "Chr", "Start", "End", "Strand", "Length", "Q313.Fwd", "Q313.Paired", "Q314.Fwd", "Q314.Paired", "Q316.Fwd", "Q316.Paired", "WT10.Fwd", "WT10.Paired", "WT11.Fwd", "WT11.Paired", "WT12.Fwd", "WT12.Paired") # Add together counts from paired and unpaired reads and select only columns of interest RawCounts <- data %>% mutate(WT10 = WT10.Fwd + WT10.Paired, WT11 = WT11.Fwd + WT11.Paired, WT12 = WT12.Fwd + WT12.Paired, Q313 = Q313.Fwd + Q313.Paired, Q314 = Q314.Fwd + Q314.Paired, Q316 = Q316.Fwd + Q316.Paired) %>% dplyr::select(Systematic, WT10, WT11, WT12, Q313, Q314, Q316) # CLR Transformation clr <- RawCounts rownames(clr) <- clr$Systematic clr <- clr[-1] clr.gt0 <- clr[apply(clr,1,sum) > 0,] #remove rows with all 0 counts clr.n0 <- cmultRepl(t(clr.gt0), method = "CZM", label = 0) #estimate 0 values clr.n0.trans <- t(apply(clr.n0, 1, function(x){log2(x) - mean(log2(x))})) #clr transformation pcx <- prcomp(clr.n0.trans) #SVD mvar.clr <- mvar(clr.n0.trans) #metric variance df_pcx <- as.data.frame(pcx$x) df_pcx$Strain <- c(rep("WT", 3), rep("tra1Q3", 3)) df_pcx$Sample <- rownames(df_pcx) ggplot(df_pcx, aes(x = PC1, y = PC2, color = Strain, label = Sample)) + geom_label(size = 3) + xlab(paste("PC1 (", (round(sum(pcx$sdev[1]^2)/mvar.clr, 3)*100), "%)")) + ylab(paste("PC2 (", (round(sum(pcx$sdev[2]^2)/mvar.clr, 3)*100), "%)")) + scale_color_npg() + theme_bw() # Form Scree Plot plot((pcx$sdev^2/mvar(clr.n0.trans))[1:10], type = "b", xlab = "Factor", ylab = "Explained") ## Heatmap of the count matrix breaklist = seq(-5, 5, by = 0.1) colors <- colorRampPalette(rev(brewer.pal(n = 7, name = "PuOr")))(length(breaklist)) pheatmap(clr.n0.trans, show_colnames = FALSE, col = colors, breaks = breaklist, annotation_legend = FALSE) ## Heatmap of the sample-to-sample distances sampleDists <- dist(clr.n0.trans) matrix.sampleDists <- as.matrix(sampleDists) color <- colorRampPalette(rev(brewer.pal(9, "Blues")))(100) pheatmap(matrix.sampleDists, clustering_distance_cols = sampleDists, clustering_distance_rows = sampleDists, col = color) ##### DESeq2 Analysis of Differentially Expressed Transcripts ##### tra1q3 <- RawCounts %>% dplyr::select(Systematic, WT10, WT11, WT12, Q313, Q314, Q316) rownames(tra1q3) <- tra1q3$Systematic tra1q3 <- tra1q3[-1] Q3.DEseq <- tra1q3 coldata <- read.table("Q3coldata.txt", header = T, sep = "\t") Q3.dds <- DESeqDataSetFromMatrix(countData = Q3.DEseq, colData = coldata, design = ~Condition) Q3.dds.DE <- DESeq(Q3.dds) Q3.results <- results(Q3.dds.DE) Q3.results.Ordered <- Q3.results[order(Q3.results$padj),] summary(Q3.results) plotMA(Q3.results, ylim = c(-2,2)) Q3.plotting <- as.data.frame(Q3.results) Q3.plotting <- mutate(Q3.plotting, Systematic = rownames(Q3.plotting)) %>% left_join(GeneNames, by = "Systematic") %>% dplyr::select(baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, Systematic, Common) ggplot(Q3.plotting, aes(x = -(log2FoldChange), y = -log10(padj), label = Common)) + geom_point(size = 2, color = if_else(Q3.plotting$padj < 0.05 & Q3.plotting$log2FoldChange > 0, "blue", if_else(Q3.plotting$padj < 0.05 & Q3.plotting$log2FoldChange < 0, "lightblue", "grey"))) + theme_bw(base_size = 16) + xlab("Log2(Fold Change)") + ylab("-Log10(Q-Value)") + geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") + geom_text_repel(data = subset(Q3.plotting, (log2FoldChange > 2 & padj < 0.05)), nudge_x = -10, direction = "y", hjust = 0.5, segment.size = 0.2) + geom_text_repel(data = subset(Q3.plotting, (log2FoldChange < -2 & padj < 0.05)), nudge_x = 10, direction = "y", hjust = 0.1, segment.size = 0.25) Q3.Counts <- as.data.frame(counts(Q3.dds.DE)) Q3.Counts <- mutate(Q3.Counts, Systematic = rownames(Q3.Counts)) %>% left_join(GeneNames, by = "Systematic") #################################################### ########## C.ALBICANS RNA SEQ ANALYSIS ############# #################################################### # Read in data from featureCounts data <- read.table("RNAseq_Tra1Candida.txt", header = T, sep = "\t") GeneNames <- read.table("ORFtoGeneName_CalbicansA21.txt", header = T, sep = "\t") # Rename column headers to something more informative colnames(data) <- c("Systematic", "Chr", "Start", "End", "Strand", "Length", "MutCASP1.Fwd", "MutCASP1.Paired", "MutCASP2.Fwd", "MutCASP2.Paired", "MutCASP3.Fwd", "MutCASP3.Paired", "MutND1.Fwd", "MutND1.Paired", "MutND2.Fwd", "MutND2.Paired", "MutND3.Fwd", "MutND3.Paired", "WTCASP1.Fwd", "WTCASP1.Paired", "WTCASP2.Fwd", "WTCASP2.Paired", "WTCASP3.Fwd", "WTCASP3.Paired", "WTND1.Fwd", "WTND1.Paired", "WTND2.Fwd", "WTND2.Paired", "WTND3.Fwd", "WTND3.Paired") # Add together counts from paired and unpaired reads and select only columns of interest RawCounts <- data %>% mutate(MutCASP1 = MutCASP1.Fwd + MutCASP1.Paired, MutCASP2 = MutCASP2.Fwd + MutCASP2.Paired, MutCASP3 = MutCASP3.Fwd + MutCASP3.Paired, WTCASP1 = WTCASP1.Fwd + WTCASP1.Paired, WTCASP2 = WTCASP2.Fwd + WTCASP2.Paired, WTCASP3 = WTCASP3.Fwd + WTCASP3.Paired, MutND1 = MutND1.Fwd + MutND1.Paired, MutND2 = MutND2.Fwd + MutND2.Paired, MutND3 = MutND3.Fwd + MutND3.Paired, WTND1 = WTND1.Fwd + WTND1.Paired, WTND2 = WTND2.Fwd + WTND2.Paired, WTND3 = WTND3.Fwd + WTND3.Paired) %>% dplyr::select(Systematic, MutCASP1, MutCASP2, MutCASP3, WTCASP1, WTCASP2, WTCASP3, MutND1, MutND2, MutND3, WTND1, WTND2, WTND3) ### CLR Transformation ### clr <- RawCounts rownames(clr) <- clr$Systematic clr <- clr[-1] clr.gt0 <- clr[apply(clr,1,sum) > 0,] #remove rows with all 0 counts clr.n0 <- cmultRepl(t(clr.gt0), method = "CZM", label = 0) #estimate 0 values clr.n0.trans <- t(apply(clr.n0, 1, function(x){log2(x) - mean(log2(x))})) #clr transformation pcx <- prcomp(clr.n0.trans) #SVD mvar.clr <- mvar(clr.n0.trans) #metric variance df_pcx <- as.data.frame(pcx$x) df_pcx$Drug <- c(rep("CASP", 6), rep("ND", 6)) df_pcx$Strain <- c(rep("MutCASP", 3), rep("WTCASP", 3), rep("MutND", 3), rep("WTND", 3)) df_pcx$Sample <- rownames(df_pcx) # Form PCA plot ggplot(df_pcx, aes(x = PC1, y = PC2, color = Strain, label = Sample)) + geom_point(size = 3) + xlab(paste("PC1: ", round(sum(pcx$sdev[1]^2)/mvar.clr, 3))) + ylab(paste("PC2: ", round(sum(pcx$sdev[2]^2)/mvar.clr, 3))) + scale_color_npg() + theme_bw() # Form Scree Plot plot((pcx$sdev^2/mvar(clr.n0.trans))[1:10], type = "b", xlab = "Factor", ylab = "Explained") ## Heatmap of the count matrix samplecol <- data.frame(sample = c(rep("CASP", 6), rep("ND", 6))) samplecol$Strain <- c(rep("Mut", 3), rep("WT", 3), rep("Mut",3), rep("WT",3)) rownames(samplecol) <- rownames(clr.n0.trans) samplecolColor = list(ID = c(Mut = "red", WT = "green")) breaklist = seq(-5, 5, by = 0.1) colors <- colorRampPalette(rev(brewer.pal(n = 7, name = "PuOr")))(length(breaklist)) pheatmap(clr.n0.trans, show_colnames = FALSE, col = colors, breaks = breaklist, annotation_legend = FALSE) ## Heatmap of the sample-to-sample distances sampleDists <- dist(clr.n0.trans) matrix.sampleDists <- as.matrix(sampleDists) color <- colorRampPalette(rev(brewer.pal(9, "Blues")))(100) pheatmap(matrix.sampleDists, clustering_distance_cols = sampleDists, clustering_distance_rows = sampleDists, col = color) ##### DESeq2 Analysis of Differentially Expressed Transcripts ##### all <- RawCounts %>% dplyr::select(Systematic, WTND1, WTND2, WTND3, MutCASP1, MutCASP2, MutCASP3, WTCASP1, WTCASP2, WTCASP3, MutND1, MutND2, MutND3) rownames(all) <- all$Systematic all <- all[-1] DEseq <- all coldata <- read.table("twofactorcols.txt", header = T, sep = "\t") all.dds <- DESeqDataSetFromMatrix(countData = DEseq, colData = coldata, design = ~ Condition + Genotype + Genotype:Condition) all.dds$Condition <- relevel(all.dds$Condition, "WT") all.dds$Genotype <- relevel(all.dds$Genotype, "WT") all.dds.DE <- DESeq(all.dds) all.results.Names <- resultsNames(all.dds.DE) #Effect of caspofungin treatment in WT main <- results(all.dds.DE, contrast = c("Condition", "CASP", "WT")) main <- main[order(main$padj),] significant.WTcondition <- main %>% as.data.frame %>% mutate(ORF = rownames(main)) %>% filter(padj <= 0.05) #Effect of caspofungin treatment on Q3 cells conditionQ3 <- results(all.dds.DE, contrast = list(c("Condition_CASP_vs_WT", "ConditionCASP.GenotypeMutant"))) conditionQ3 <- conditionQ3[order(conditionQ3$padj),] significant.Q3condition <- conditionQ3 %>% as.data.frame %>% mutate(ORF = rownames(conditionQ3)) %>% filter(padj <= 0.05) #Difference between WT and Q3 without caspofungin treatment Q3.notreat <- results(all.dds.DE, contrast = c("Genotype", "Mutant", "WT")) Q3.notreat <- Q3.notreat[order(Q3.notreat$padj),] significant.NDgenotype <- Q3.notreat %>% as.data.frame %>% mutate(ORF = rownames(Q3.notreat)) %>% filter(padj <= 0.05) #Different between WT and Q3 with caspofungin treatment Q3.treat <- results(all.dds.DE, contrast = list(c("Genotype_Mutant_vs_WT", "ConditionCASP.GenotypeMutant"))) Q3.treat <- Q3.treat[order(Q3.treat$padj),] significant.CASPgenotype <- Q3.treat %>% as.data.frame %>% mutate(ORF = rownames(Q3.treat)) %>% filter(padj <= 0.05 & log2FoldChange < 0) # Different response in genotypes # Tests if the condition effect is different in WT compared to Q3 interaction <- results(all.dds.DE, name = "ConditionCASP.GenotypeMutant") interaction <- interaction[order(interaction$padj),] significant.interaction <- interaction %>% as.data.frame %>% mutate(ORF = rownames(interaction)) %>% filter((log2FoldChange > 1 | log2FoldChange < -1) & padj <= 0.05) interaction.plotting <- as.data.frame(interaction) ggplot(interaction.plotting, aes(x = -(log2FoldChange), y = -log10(padj))) + geom_point(size = 2, color = if_else(interaction.plotting$padj < 0.05 & interaction.plotting$log2FoldChange > 1, "darkred", if_else(interaction.plotting$padj < 0.05 & interaction.plotting$log2FoldChange < -1, "red", "grey"))) + theme_bw(base_size = 16) + xlab("Log2(Fold Change)") + ylab("-Log10(Q-Value)") + geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") + coord_cartesian(xlim = c(-4, 4)) ## Normalized counts GLM.counts <- as.data.frame(counts(all.dds.DE, normalize = T)) GLM.counts$ORF <- rownames(GLM.counts) GLM.counts.names <- GLM.counts write.csv(GLM.counts.names, "Tra1CASP_RNASeq_NormalizedCounts.csv") ### EVP1 Normalized Counts Plot ### Count <- counts(all.dds.DE) Evp1 <- Count["orf19.6741",] Evp1.melt <- melt(Evp1) Evp1.melt$Type <- c(rep("WTND", 3), rep("MutCASP", 3), rep("WTCASP", 3), rep("MutND", 3)) Evp1.melt$Type <- factor(Evp1.melt$Type, levels = c("WTND", "MutND", "WTCASP", "MutCASP")) ggplot(Evp1.melt, aes(y = value, x = Type)) + geom_jitter(width = 0.15, size = 3) + theme_bw(base_size = 16) + ylab("Normalized Counts") + xlab("") + stat_compare_means(comparisons = testcomparisons, method = "t.test")