##############################################
########## TRA1 RNA SEQ ANALYSIS #############
############## Matthew D. Berg ###############
##############################################

#Packages needed to run this code
library(tidyverse)
library(compositions)
library(zCompositions)
library(pheatmap)
library(RColorBrewer)
library(ggsci)
library(DESeq2)
library(reshape2)
library(stringr)

######################################################
########## S.CEREVISIAE RNA SEQ ANALYSIS #############
######################################################

# Read in data from featureCounts
data <- read.table("tra1q3Readtable.txt", header = T, sep = "\t")
GeneNames <- read.delim("YeastCommonSystematicNames.txt", header = T, sep = "\t")

# Rename column headers to something more informative
colnames(data) <- c("Systematic", "Chr", "Start", "End", "Strand", "Length", "Q313.Fwd", "Q313.Paired", "Q314.Fwd", "Q314.Paired", "Q316.Fwd", "Q316.Paired", "WT10.Fwd", "WT10.Paired", "WT11.Fwd", "WT11.Paired", "WT12.Fwd", "WT12.Paired")

# Add together counts from paired and unpaired reads and select only columns of interest
RawCounts <- data %>%
  mutate(WT10 = WT10.Fwd + WT10.Paired,
         WT11 = WT11.Fwd + WT11.Paired,
         WT12 = WT12.Fwd + WT12.Paired,
         Q313 = Q313.Fwd + Q313.Paired,
         Q314 = Q314.Fwd + Q314.Paired,
         Q316 = Q316.Fwd + Q316.Paired) %>%
  dplyr::select(Systematic, 
                WT10, 
                WT11, 
                WT12, 
                Q313, 
                Q314, 
                Q316)

# CLR Transformation

clr <- RawCounts
rownames(clr) <- clr$Systematic
clr <- clr[-1]

clr.gt0 <- clr[apply(clr,1,sum) > 0,] #remove rows with all 0 counts
clr.n0 <- cmultRepl(t(clr.gt0), method = "CZM", label = 0) #estimate 0 values
clr.n0.trans <- t(apply(clr.n0, 1, function(x){log2(x) - mean(log2(x))})) #clr transformation

pcx <- prcomp(clr.n0.trans) #SVD
mvar.clr <- mvar(clr.n0.trans) #metric variance

df_pcx <- as.data.frame(pcx$x)
df_pcx$Strain <- c(rep("WT", 3), rep("tra1Q3", 3))
df_pcx$Sample <- rownames(df_pcx)
ggplot(df_pcx, aes(x = PC1, y = PC2, color = Strain, label = Sample)) + geom_label(size = 3) + xlab(paste("PC1 (", (round(sum(pcx$sdev[1]^2)/mvar.clr, 3)*100), "%)")) + ylab(paste("PC2 (", (round(sum(pcx$sdev[2]^2)/mvar.clr, 3)*100), "%)")) + scale_color_npg() + theme_bw()

# Form Scree Plot
plot((pcx$sdev^2/mvar(clr.n0.trans))[1:10], type = "b", xlab = "Factor", ylab = "Explained")

## Heatmap of the count matrix

breaklist = seq(-5, 5, by = 0.1)

colors <- colorRampPalette(rev(brewer.pal(n = 7, name = "PuOr")))(length(breaklist))

pheatmap(clr.n0.trans, show_colnames = FALSE, col = colors, breaks = breaklist, annotation_legend = FALSE)

## Heatmap of the sample-to-sample distances
sampleDists <- dist(clr.n0.trans)
matrix.sampleDists <- as.matrix(sampleDists)

color <- colorRampPalette(rev(brewer.pal(9, "Blues")))(100)

pheatmap(matrix.sampleDists, clustering_distance_cols = sampleDists, clustering_distance_rows = sampleDists, col = color)

##### DESeq2 Analysis of Differentially Expressed Transcripts #####

tra1q3 <- RawCounts %>%
  dplyr::select(Systematic, WT10, WT11, WT12, Q313, Q314, Q316)
rownames(tra1q3) <- tra1q3$Systematic
tra1q3 <- tra1q3[-1]

Q3.DEseq <- tra1q3

coldata <- read.table("Q3coldata.txt", header = T, sep = "\t")

Q3.dds <- DESeqDataSetFromMatrix(countData = Q3.DEseq, colData = coldata, design = ~Condition)
Q3.dds.DE <- DESeq(Q3.dds)
Q3.results <- results(Q3.dds.DE)

Q3.results.Ordered <- Q3.results[order(Q3.results$padj),]
summary(Q3.results)

plotMA(Q3.results, ylim = c(-2,2))

Q3.plotting <- as.data.frame(Q3.results) 

Q3.plotting <- mutate(Q3.plotting, Systematic = rownames(Q3.plotting)) %>%
  left_join(GeneNames, by = "Systematic") %>%
  dplyr::select(baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, Systematic, Common)

ggplot(Q3.plotting, aes(x = -(log2FoldChange), y = -log10(padj), label = Common)) + geom_point(size = 2, color = if_else(Q3.plotting$padj < 0.05 & Q3.plotting$log2FoldChange > 0, "blue", if_else(Q3.plotting$padj < 0.05 & Q3.plotting$log2FoldChange < 0, "lightblue", "grey"))) + theme_bw(base_size = 16) + xlab("Log2(Fold Change)") + ylab("-Log10(Q-Value)") + geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") + geom_text_repel(data = subset(Q3.plotting, (log2FoldChange > 2 & padj < 0.05)), nudge_x = -10, direction = "y", hjust = 0.5, segment.size = 0.2) + geom_text_repel(data = subset(Q3.plotting, (log2FoldChange < -2 & padj < 0.05)), nudge_x = 10, direction = "y", hjust = 0.1, segment.size = 0.25)

Q3.Counts <- as.data.frame(counts(Q3.dds.DE))

Q3.Counts <- mutate(Q3.Counts, Systematic = rownames(Q3.Counts)) %>%
  left_join(GeneNames, by = "Systematic")



####################################################
########## C.ALBICANS RNA SEQ ANALYSIS #############
####################################################

# Read in data from featureCounts
data <- read.table("RNAseq_Tra1Candida.txt", header = T, sep = "\t")
GeneNames <- read.table("ORFtoGeneName_CalbicansA21.txt", header = T, sep = "\t")

# Rename column headers to something more informative
colnames(data) <- c("Systematic", "Chr", "Start", "End", "Strand", "Length", "MutCASP1.Fwd", "MutCASP1.Paired", "MutCASP2.Fwd", "MutCASP2.Paired", "MutCASP3.Fwd", "MutCASP3.Paired", "MutND1.Fwd", "MutND1.Paired", "MutND2.Fwd", "MutND2.Paired", "MutND3.Fwd", "MutND3.Paired", "WTCASP1.Fwd", "WTCASP1.Paired", "WTCASP2.Fwd", "WTCASP2.Paired", "WTCASP3.Fwd", "WTCASP3.Paired", "WTND1.Fwd", "WTND1.Paired", "WTND2.Fwd", "WTND2.Paired", "WTND3.Fwd", "WTND3.Paired")

# Add together counts from paired and unpaired reads and select only columns of interest
RawCounts <- data %>%
  mutate(MutCASP1 = MutCASP1.Fwd + MutCASP1.Paired,
         MutCASP2 = MutCASP2.Fwd + MutCASP2.Paired,
         MutCASP3 = MutCASP3.Fwd + MutCASP3.Paired,
         WTCASP1 = WTCASP1.Fwd + WTCASP1.Paired,
         WTCASP2 = WTCASP2.Fwd + WTCASP2.Paired,
         WTCASP3 = WTCASP3.Fwd + WTCASP3.Paired,
         MutND1 = MutND1.Fwd + MutND1.Paired,
         MutND2 = MutND2.Fwd + MutND2.Paired,
         MutND3 = MutND3.Fwd + MutND3.Paired,
         WTND1 = WTND1.Fwd + WTND1.Paired,
         WTND2 = WTND2.Fwd + WTND2.Paired,
         WTND3 = WTND3.Fwd + WTND3.Paired) %>%
  dplyr::select(Systematic, 
         MutCASP1, 
         MutCASP2, 
         MutCASP3, 
         WTCASP1, 
         WTCASP2, 
         WTCASP3, 
         MutND1, 
         MutND2, 
         MutND3, 
         WTND1, 
         WTND2, 
         WTND3)

### CLR Transformation ###

clr <- RawCounts
rownames(clr) <- clr$Systematic
clr <- clr[-1]

clr.gt0 <- clr[apply(clr,1,sum) > 0,] #remove rows with all 0 counts
clr.n0 <- cmultRepl(t(clr.gt0), method = "CZM", label = 0) #estimate 0 values
clr.n0.trans <- t(apply(clr.n0, 1, function(x){log2(x) - mean(log2(x))})) #clr transformation

pcx <- prcomp(clr.n0.trans) #SVD
mvar.clr <- mvar(clr.n0.trans) #metric variance

df_pcx <- as.data.frame(pcx$x)
df_pcx$Drug <- c(rep("CASP", 6), rep("ND", 6))
df_pcx$Strain <- c(rep("MutCASP", 3), rep("WTCASP", 3), rep("MutND", 3), rep("WTND", 3))
df_pcx$Sample <- rownames(df_pcx)

# Form PCA plot
ggplot(df_pcx, aes(x = PC1, y = PC2, color = Strain, label = Sample)) + geom_point(size = 3) + xlab(paste("PC1: ", round(sum(pcx$sdev[1]^2)/mvar.clr, 3))) + ylab(paste("PC2: ", round(sum(pcx$sdev[2]^2)/mvar.clr, 3))) + scale_color_npg() + theme_bw()

# Form Scree Plot
plot((pcx$sdev^2/mvar(clr.n0.trans))[1:10], type = "b", xlab = "Factor", ylab = "Explained")

## Heatmap of the count matrix
samplecol <- data.frame(sample = c(rep("CASP", 6), rep("ND", 6)))
samplecol$Strain <- c(rep("Mut", 3), rep("WT", 3), rep("Mut",3), rep("WT",3))
rownames(samplecol) <- rownames(clr.n0.trans)
samplecolColor = list(ID = c(Mut = "red", WT = "green"))

breaklist = seq(-5, 5, by = 0.1)

colors <- colorRampPalette(rev(brewer.pal(n = 7, name = "PuOr")))(length(breaklist))

pheatmap(clr.n0.trans, show_colnames = FALSE, col = colors, breaks = breaklist, annotation_legend = FALSE)

## Heatmap of the sample-to-sample distances
sampleDists <- dist(clr.n0.trans)
matrix.sampleDists <- as.matrix(sampleDists)

color <- colorRampPalette(rev(brewer.pal(9, "Blues")))(100)

pheatmap(matrix.sampleDists, clustering_distance_cols = sampleDists, clustering_distance_rows = sampleDists, col = color)

##### DESeq2 Analysis of Differentially Expressed Transcripts #####

all <- RawCounts %>%
  dplyr::select(Systematic, WTND1, WTND2, WTND3, MutCASP1, MutCASP2, MutCASP3, WTCASP1, WTCASP2, WTCASP3, MutND1, MutND2, MutND3)
rownames(all) <- all$Systematic
all <- all[-1]

DEseq <- all

coldata <- read.table("twofactorcols.txt", header = T, sep = "\t")

all.dds <- DESeqDataSetFromMatrix(countData = DEseq, colData = coldata, design = ~ Condition + Genotype + Genotype:Condition)
all.dds$Condition <- relevel(all.dds$Condition, "WT")
all.dds$Genotype <- relevel(all.dds$Genotype, "WT")
all.dds.DE <- DESeq(all.dds)
all.results.Names <- resultsNames(all.dds.DE)

#Effect of caspofungin treatment in WT
main <- results(all.dds.DE, contrast = c("Condition", "CASP", "WT"))
main <- main[order(main$padj),]

significant.WTcondition <- main %>%
  as.data.frame %>%
  mutate(ORF = rownames(main)) %>%
  filter(padj <= 0.05)

#Effect of caspofungin treatment on Q3 cells
conditionQ3 <- results(all.dds.DE, contrast = list(c("Condition_CASP_vs_WT", "ConditionCASP.GenotypeMutant")))
conditionQ3 <- conditionQ3[order(conditionQ3$padj),]

significant.Q3condition <- conditionQ3 %>%
  as.data.frame %>%
  mutate(ORF = rownames(conditionQ3)) %>%
  filter(padj <= 0.05)

#Difference between WT and Q3 without caspofungin treatment
Q3.notreat <- results(all.dds.DE, contrast = c("Genotype", "Mutant", "WT"))
Q3.notreat <- Q3.notreat[order(Q3.notreat$padj),]

significant.NDgenotype <- Q3.notreat %>%
  as.data.frame %>%
  mutate(ORF = rownames(Q3.notreat)) %>%
  filter(padj <= 0.05)

#Different between WT and Q3 with caspofungin treatment
Q3.treat <- results(all.dds.DE, contrast = list(c("Genotype_Mutant_vs_WT", "ConditionCASP.GenotypeMutant")))
Q3.treat <- Q3.treat[order(Q3.treat$padj),]

significant.CASPgenotype <- Q3.treat %>%
  as.data.frame %>%
  mutate(ORF = rownames(Q3.treat)) %>%
  filter(padj <= 0.05 & log2FoldChange < 0)

# Different response in genotypes
# Tests if the condition effect is different in WT compared to Q3
interaction <- results(all.dds.DE, name = "ConditionCASP.GenotypeMutant")
interaction <- interaction[order(interaction$padj),]

significant.interaction <- interaction %>%
  as.data.frame %>%
  mutate(ORF = rownames(interaction)) %>%
  filter((log2FoldChange > 1 | log2FoldChange < -1) & padj <= 0.05)

interaction.plotting <- as.data.frame(interaction)

ggplot(interaction.plotting, aes(x = -(log2FoldChange), y = -log10(padj))) + geom_point(size = 2, color = if_else(interaction.plotting$padj < 0.05 & interaction.plotting$log2FoldChange > 1, "darkred", if_else(interaction.plotting$padj < 0.05 & interaction.plotting$log2FoldChange < -1, "red", "grey"))) + theme_bw(base_size = 16) + xlab("Log2(Fold Change)") + ylab("-Log10(Q-Value)") + geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") + coord_cartesian(xlim = c(-4, 4))

## Normalized counts
GLM.counts <- as.data.frame(counts(all.dds.DE, normalize = T))
GLM.counts$ORF <- rownames(GLM.counts)
GLM.counts.names <- GLM.counts
write.csv(GLM.counts.names, "Tra1CASP_RNASeq_NormalizedCounts.csv")

### EVP1 Normalized Counts Plot ###

Count <- counts(all.dds.DE)
Evp1 <- Count["orf19.6741",] 
Evp1.melt <- melt(Evp1)
Evp1.melt$Type <- c(rep("WTND", 3), rep("MutCASP", 3), rep("WTCASP", 3), rep("MutND", 3))
Evp1.melt$Type <- factor(Evp1.melt$Type, levels = c("WTND", "MutND", "WTCASP", "MutCASP"))
ggplot(Evp1.melt, aes(y = value, x = Type)) + geom_jitter(width = 0.15, size = 3) + theme_bw(base_size = 16) + ylab("Normalized Counts") + xlab("") + stat_compare_means(comparisons = testcomparisons, method = "t.test")