############################################################# # # # Graphical display of SNP frequencies in annotated regions # # # ############################################################# # Set up workspace #### rm(list=ls()) # At Institute setwd("I:/ABI/Praktikum Thuenen/Variant detection/ab 180327") # At Home setwd("F:/ABI/Praktikum Thuenen/Variant detection/ab 180327") pacman::p_load(tidyverse,grid,gridExtra,reshape2,ggthemes,svglite) library(reshape) erstelle_Plotdaten <- function(eingabe,coveragefile,fastafile,modifyHeader = F){ # Load a dataset #### daten <- read.table(file = eingabe, header = T, sep = "\t", stringsAsFactors = F) tsv <- read.table(file = coveragefile, header = F, sep = "\t", stringsAsFactors = F) fasta_raw <- scan(file = fastafile, what = character(), sep = "\n") # Tidy up the input dataset #### if(modifyHeader){ daten$Chromosome = gsub(x = daten$Chromosome, pattern = "lcl|KT337313.1_cds_", replacement = "", fixed = T) daten$Gene.number = as.numeric(gsub(x = unlist(strsplit(daten$Chromosome,"_"))[seq(2,length(daten$Chromosome)*2,2)], pattern = " mapping", replacement = "")) daten$Chromosome = unlist(strsplit(daten$Chromosome,"_"))[seq(1,(2*length(daten$Chromosome)),2)] } colnames(daten)[1] <- "Protein.ID" # Determine the number of genes from the fasta file ngenes = 0 for(i in 1:length(fasta_raw)){ if(substr(fasta_raw[i],1,1) == ">"){ ngenes <- ngenes + 1 } } # Parse the fasta file into a data frame containing titles, sequences, and sequence lengths fasta <- data.frame(Title = rep("",times=ngenes), Sequence = "", Number.of.bases = NA, stringsAsFactors = F) j = 0 for(i in 1:length(fasta_raw)){ if(substr(fasta_raw[i],1,1) == ">"){ j <- j + 1 fasta$Title[j] = substr(fasta_raw[i],2,nchar(fasta_raw[i])) k <- 1 while(substr(fasta_raw[i + k],1,1) != ">" && i + k <= length(fasta_raw)){ fasta$Sequence[j] <- paste(fasta$Sequence[j],fasta_raw[i + k],sep="") k <- k + 1 fasta$Number.of.bases[j] <- nchar(fasta$Sequence[j]) if(modifyHeader){ rownames(fasta)[j] <- strsplit(fasta$Title,"_")[[j]][length(strsplit(fasta$Title,"_")[[j]])] }else{ rownames(fasta)[j] <- fasta$Title[j] } } } } laengste <- max(fasta$Number.of.bases) maxcov <- max(tsv$V11) # Create data tables for each gene #### #****************************** #****************************** #*** FILTERING HAPPENS HERE *** #****************************** #****************************** plotdaten <- list() for(i in 1:nrow(fasta)){ temp <- data.frame(Pos = seq(from = 1, to = fasta$Number.of.bases[i], by = 1), Freq = 0, Cov = tsv$V11[which(tsv$V1 == fasta$Title[i] & tsv$V3 == "-")], Count = tsv$V8[which(tsv$V1 == fasta$Title[i] & tsv$V3 == "-")]) for(j in 1:nrow(daten)){ if(rownames(fasta)[i] == daten$Protein.ID[j] && daten$Reference[j] == "C" && (daten$Allele[j] == "T" || daten$Allele[j] == T) && daten$Frequency[j] >= 10 && temp$Cov[daten$Region[j]] >= 10 && temp$Count[daten$Region[j]] >= 3 && daten$Probability[j] >= .95 && grepl("hypothetical_protein_",daten$Protein.ID[j]) == F && grepl("rrn",daten$Protein.ID[j]) == F && grepl("trn",daten$Protein.ID[j]) == F){ temp$Freq[daten$Region[j]] <- daten$Frequency[j] # temp$Cov[daten$Region[j]] <- daten$Coverage[j] } } # if(sum(temp$Freq) > 0){ plotdaten[[rownames(fasta)[i]]] <- temp # } } plotdaten$fasta <- fasta plotdaten$tsv <- tsv plotdaten$laengste <- max(fasta$Number.of.bases) plotdaten$maxcov <- max(tsv$V11) # Change Colnames of the tsv dataframe #### colnames(tsv) <- c("Reference name", "Reference position", "Reference sub-position (insertion)", "Reference symbol", "Number of A's", "Number of C's", "Number of G's", "Number of T's", "Number of N's", "Number of gaps", "Total number of reads covering the position") return(plotdaten) } #********************************* # **** END erstelle_Plotdaten **** #********************************* ####################################### #### #### #### *** START OF MAIN PROGRAM *** #### #### #### ####################################### # eingaben <- c("Bliz153/to CDS collection/alle drei auf CDS collection mapping (Locally Realigned, Variants) potential editing sites.txt", # "W100/to CDS collection/alle drei auf CDS collection mapping (Locally Realigned, Variants) potential editing sites.txt", # "W52 stringent settings/to CDS collection/alle drei auf CDS collection mapping (Locally Realigned, Variants) potential editing sites.txt", # "W52 default settings/to CDS collection/alle drei auf CDSs (Locally Realigned) editing sites low frequency.txt") # eingaben <- c("alltranscripts_allgenotypes/Asp201_all trimmed (paired) mapping_at (Locally Realigned)_at (Variants)_at_lf editing sites.txt", # "alltranscripts_allgenotypes/W52_all mapping_at (Locally Realigned)_at (Variants)_at_lf editing sites.txt", # "alltranscripts_allgenotypes/W100_all mapping_at (Locally Realigned)_at (Variants)_at_lf editing sites.txt", # "alltranscripts_allgenotypes/Bliz153_all mapping_at (Locally Realigned)_at (Variants)_at_lf editing sites.txt") ######################################################################################## #### *** #### #### *** All information needed has to be entered into the next couple of lines *** #### #### #### ######################################################################################## # In the next line, enter the directory name which contains the files with the filtered editing sites verzeichnis <- "four_species_final" eingaben <- dir(verzeichnis) eingaben <- eingaben[which(grepl("editing sites",eingaben))] eingaben <- paste0(verzeichnis,"/",eingaben) # In the current version, the path containing the TSV files with the mapping coverages is generated automatically # from the directory name entered above. If necessary, it can also be set automatically. coverages <- dir(paste0("../../Mapping coverages/",verzeichnis)) coverages <- paste0("../../Mapping coverages/",verzeichnis,"/",coverages) # In the next line, enter the path and name of the FASTA file to which the reads were mapped fastafile <- "../../Reference data/P_tremula_mito_alltranscripts.fasta" # In the next line, enter the path to which the plots will be exported zielverzeichnis <- paste0("Plots/",verzeichnis,"/") # In the next line, enter the desired title of the legends of the graphs legendentitel <- "Species" # The color palette for the colored diagrams is defined here farbpalette <- c(rgb(0,146,146,maxColorValue = 255), rgb(219,209,0,maxColorValue = 255), rgb(182,109,219,maxColorValue = 255), rgb(216,174,22,maxColorValue = 255)) # The output formats are listed here. It is advisable to place the formats that # support only one page at the end, so they are automatically skipped if they # produce an error in the multi-page outputs formate <- c("pdf","png","ps","eps","wmf","svg") for(format in formate){ if(!dir.exists(paste0(zielverzeichnis,format))){ dir.create(paste0(zielverzeichnis,format),recursive = T) } } plotdaten <- list() for(i in 1:length(eingaben)){ datensatz = unlist(strsplit(unlist(strsplit(eingaben[i],"/"))[2]," "))[1] plotdaten[[datensatz]] <- erstelle_Plotdaten(eingaben[i],coverages[i],fastafile,modifyHeader = F) } names(plotdaten) <- gsub("_",".",names(plotdaten)) # We do not want to plot those genes which have no edited base, so we filter them out rausschmeisser <- c() for(i in names(plotdaten[[1]])[1:(length(names(plotdaten[[1]]))-4)]){ kommtweg <- T for(j in names(plotdaten)){ if(max(plotdaten[[j]][[i]]$Freq) > 0){ kommtweg <- F } } if(kommtweg){ rausschmeisser <- c(rausschmeisser, i) } } for(j in names(plotdaten)){ plotdaten[[j]][rausschmeisser] <- NULL } # Simple plots #### # plots = list() # for(i in 1:length(plotdaten)){ # plots[[i]] <- ggplot(data = plotdaten[[i]])+ # geom_bar(aes_string(x = "Pos", y = "Freq"), # stat="identity", # fill = "red")+ # scale_y_continuous(limits = c(0,100), # name = "Editing\nfrequency\n[%]", # breaks = c(0,50,100), # labels = c(0,50,100), # minor_breaks = seq(0,100,10))+ # scale_x_continuous(limits = c(0,laengste), # name = paste(names(plotdaten)[i], # "[base]"), # breaks = seq(0,laengste,500), # labels = seq(0,laengste,500), # minor_breaks = seq(0,laengste,100))+ # # geom_segment(aes(x = 1, # # y = 0, # # xend = fasta$Number.of.bases[i], # # yend = 0), # # colour = "black")+ # labs(x = paste(names(plotdaten)[i], # "[base]"))+ # theme_minimal() # # } #for(i in 1:length(plots)){ # print(plots[[i]]) #} #grid.arrange(grobs = plots[1:length(plots)], nrow = length(plots)) # Nicer plots #### ids <- c() lengths <- c() maxcov <- 0 plotdaten_l <- data.frame(Genname = "Dummy", Base <- 0, stringsAsFactors = F) # Generate the output structure, # determine the longest sequence and the highest coverage, and # make a list of all genes that occur at least once in any of the conditions ### for(datensatz in names(plotdaten)){ plotdaten_l <- cbind(plotdaten_l,c(0),c(0)) colnames(plotdaten_l)[ncol(plotdaten_l)-1] <- paste0(datensatz, "_frequency") colnames(plotdaten_l)[ncol(plotdaten_l)] <- paste0(datensatz, "_coverage") for(id in names(plotdaten[[datensatz]])[1:(length(names(plotdaten[[datensatz]]))-4)]){ if(! id %in% ids){ ids <- c(ids, id) lengths <- c(lengths, plotdaten[[datensatz]]$fasta$Number.of.bases[which(plotdaten[[datensatz]]$fasta$Title == id)]) } } } laengste <- max(lengths) nids <- length(ids) hilfstabelle <- data.frame(id = ids, len = lengths) # fill the output structure with data n <- 1 for(id in ids){ basen <- hilfstabelle$len[which(hilfstabelle$id == id)] balken = winProgressBar(title = paste0(id, " (", n, " of ", nids, ") - ", basen, " bases"), min = 0, max = basen, initial = 0, width = 500) for(base in 1:basen){ setWinProgressBar(balken,base) temp <- c(id, base) for(datensatz in names(plotdaten)){ if(id %in% names(plotdaten[[datensatz]])){ temp <- c(temp, plotdaten[[datensatz]][[id]]$Freq[base], plotdaten[[datensatz]][[id]]$Cov[base]) }else{ temp <- c(temp,0,0) } } plotdaten_l <- rbind(plotdaten_l, temp) } close(balken) n <- n + 1 } plotdaten_l <- plotdaten_l[2:nrow(plotdaten_l),] names(plotdaten_l)[2] <- "Base" for(i in 2:ncol(plotdaten_l)){ plotdaten_l[,i] <- as.numeric(plotdaten_l[,i]) } plotdaten_l$Genname <- gsub("_"," ",plotdaten_l$Genname) plotdaten_l$Genname <- gsub("hypothetical protein","hyp.",plotdaten_l$Genname) # scale coverages to the 90th percentile coveragedaten <- which(grepl("_coverage",names(plotdaten_l))) maxcov <- quantile(melt(plotdaten_l[coveragedaten])[,2],probs = .9) plotdaten_long <- plotdaten_l #plotdaten_long[coveragedaten] <- plotdaten_l[coveragedaten]*100/maxcov # Set of single-color diagrams, one line per gene, all genes on one page #### diagramme <- list() for(i in which(grepl("_frequency",names(plotdaten_long)))){ diagrammname <- gsub("_frequency","", gsub("\\.",". ",colnames(plotdaten_long)[i])) diagramme[[diagrammname]] <- ggplot(plotdaten_long)+ geom_line(aes_string(x = "Base", y = colnames(plotdaten_long)[i+1]), colour = "blue", linetype = "solid", size = .1, alpha = .25)+ geom_bar(aes_string(x = "Base", y = colnames(plotdaten_long)[i]), stat = "identity", fill = "red")+ scale_y_continuous(limits = c(0,100), name = "Editing frequency [%] / Coverage", breaks = c(0,50,100), labels = c(0,50,100), minor_breaks = seq(0,100,10))+ scale_x_continuous(limits = c(0,laengste), breaks = seq(0,laengste,500), labels = seq(0,laengste,500), minor_breaks = seq(0,laengste,100))+ theme_light(base_size = 9)+ ggtitle(label = paste0("Mitochondrial RNA editing sites in ", gsub("_",". ",diagrammname)))+ theme(plot.title = element_text(hjust = 0.5))+ facet_grid(Genname ~ .)+ theme(strip.text.y = element_text(angle = 0)) } for(diagramm in names(diagramme)){ for(format in formate){ print(paste0("Saving single-color diagram for ",diagramm," as ",format,"...")) ggsave(filename = paste0(zielverzeichnis, format, "/Mitochondrial RNA editing sites in ", diagramm, " portrait.", format), plot = diagramme[[diagramm]], device = format, width = 210, height = 297, units = "mm") ggsave(filename = paste0(zielverzeichnis, format, "/Mitochondrial RNA editing sites in ", gsub("\\."," ",diagramm), " landscape.", format), plot = diagramme[[diagramm]], device = format, width = 297, height = 210, units = "mm") } } for(format in formate[1:4]){ print(paste0("Saving single-color diagrams as ",format)) ggsave(filename = paste0(zielverzeichnis, format, "/Mitochondrial editing sites.", format), marrangeGrob(grobs = diagramme, ncol = 1, nrow = 1), device = format, width = 210, height = 297, units = "mm") } # Multicolor diagram, one line per gene, all genes on one page #### plotdaten_long_m <- melt.data.frame(data = plotdaten_long, id.vars = c(1,2), measure.vars = c(3:ncol(plotdaten_long))) covs <- plotdaten_long_m$value[grepl("_coverage",plotdaten_long_m$variable)] plotdaten_long_m <- plotdaten_long_m[grepl("_coverage",plotdaten_long_m$variable) == F,] plotdaten_long_m$covs <- covs plotdaten_long_m$variable <- as.factor(gsub(pattern = "_frequency", replacement = "", x = plotdaten_long_m$variable)) diagramm_bunt <- ggplot(data = plotdaten_long_m)+ geom_line(aes(x = Base, y = covs, color = variable), stat = "identity", linetype = "solid", alpha = .25)+ geom_bar(aes(x = Base, y = value, fill = variable), stat = "identity", position = "dodge")+ scale_y_continuous(limits = c(0,100), name = "Editing frequency [%] / Coverage", breaks = c(0,50,100), labels = c(0,50,100), minor_breaks = seq(0,100,10))+ scale_x_continuous(limits = c(0,laengste), breaks = seq(0,laengste,500), labels = seq(0,laengste,500), minor_breaks = seq(0,laengste,100))+ scale_color_discrete(name = legendentitel, breaks = gsub("\\.", ". ",names(plotdaten)), labels = gsub("\\.", ". ",names(plotdaten)))+ guides(color = guide_legend(legendentitel))+ theme_light(base_size = 9)+ guides(color = F) diagramm_final <- diagramm_bunt+ facet_grid(Genname ~ .)+ theme(strip.text.y = element_text(angle = 0), legend.position = c(0.82,0.82)) for(format in formate){ print(paste0("Saving multi-color diagram as ",format)) ggsave(filename = paste0(zielverzeichnis, format, "/Mitochondrial editing sites colored portrait.", format), plot = diagramm_final, device = format, width = 210, height = 297, units = "mm") ggsave(filename = paste0(zielverzeichnis, format, "/Mitochondrial editing sites colored landscape.", format), plot = diagramm_final, device = format, width = 297, height = 210, units = "mm") } # Broken colored diagrams, 100 bp per line, one gene per page #### gebrochene_plotdaten <- list() for(genname in unique(plotdaten_long_m$Genname)){ gebrochene_plotdaten[[genname]] <- data.frame(Segment = as.factor(character(0)), Base = numeric(0), Genotype = as.factor(character(0)), value = numeric(0), covs = numeric(0), stringsAsFactors = F) } segmentgroesse <- 100 j <- 1 segment = c(j * segmentgroesse - (segmentgroesse-1), j * segmentgroesse) balken = winProgressBar(title = "Breaking up dataset into gene-specific plot data...", min = 0, max = nrow(plotdaten_long_m), width = 500) for(i in 1:nrow(plotdaten_long_m)){ setWinProgressBar(balken, i) gebrochene_plotdaten[[plotdaten_long_m$Genname[i]]] <- rbind(gebrochene_plotdaten[[plotdaten_long_m$Genname[i]]], data.frame(Segment = as.factor(paste0(segment[1], "..", segment[2])), Base = j, Species = as.factor(gsub("\\.",". ",plotdaten_long_m$variable[i])), value = plotdaten_long_m$value[i], covs = plotdaten_long_m$covs[i], stringsAsFactors = FALSE)) j <- j + 1 if(j > segmentgroesse){ j <- 1 segment <- segment + segmentgroesse } if(!is.na(plotdaten_long_m$Genname[i+1])){ if(plotdaten_long_m$Genname[i+1] != plotdaten_long_m$Genname[i]){ j <- 1 segment <- c(j * segmentgroesse - (segmentgroesse-1), j * segmentgroesse) } } } close(balken) bunte_diagramme <- list() for(datensatz in sort(names(gebrochene_plotdaten))){ bunte_diagramme[[datensatz]] <- ggplot(data = gebrochene_plotdaten[[datensatz]])+ geom_line(aes(x = Base, y = covs, color = Species), linetype = "solid", alpha = .25)+ geom_bar(aes(x = Base, y = value, fill = Species), stat = "identity", position = "dodge")+ scale_y_continuous(limits = c(0,100), name = "Editing frequency [%] / Coverage", breaks = c(0,50,100), labels = c(0,50,100), minor_breaks = seq(0,100,10))+ scale_x_continuous(limits = c(0,segmentgroesse), breaks = seq(0,segmentgroesse,10), labels = seq(0,segmentgroesse,10), minor_breaks = seq(0,laengste,2))+ theme_light(base_size = 9)+ facet_grid(Segment ~ .)+ theme(strip.text.y = element_text(angle = 0), legend.position = "bottom", plot.title = element_text(hjust = 0.5))+ ggtitle(label = paste0("RNA editing sites in ", datensatz)) } for(format in formate[1:4]){ print(paste0("Saving broken colored diagrams as ",format," in portrait orientation...")) ggsave(filename = paste0(zielverzeichnis, format, "/Mitochondrial editing sites per gene portrait.",format), marrangeGrob(grobs = bunte_diagramme, ncol = 1, nrow = 1, top = NULL, bottom = quote(paste("Page",g))), device = format, width = 210, height = 297, units = "mm") } bunte_diagramme_quer <- list() for(datensatz in sort(names(gebrochene_plotdaten))){ bunte_diagramme_quer[[datensatz]] <- bunte_diagramme[[datensatz]]+ theme(legend.position = "right") } for(format in formate[1:4]){ print(paste0("Saving broken colored diagrams as ",format," in landscape orientation...")) ggsave(filename = paste0(zielverzeichnis, format, "/Mitochondrial editing sites per gene landscape.",format), marrangeGrob(grobs = bunte_diagramme_quer, ncol = 1, nrow = 1, top = NULL, bottom = quote(paste("Page",g))), device = format, width = 297, height = 210, units = "mm") } for(diagramm in names(bunte_diagramme)){ for(format in formate){ print(paste0("Saving broken multicolor diagram for ",diagramm," as ",format,"...")) ggsave(filename = paste0(zielverzeichnis, format, "/Editing sites in ", diagramm, ".", format), plot = bunte_diagramme[[diagramm]], device = format, width = 297, height = 210, units = "mm") } }