##########################################################################
############################ Main Function ###############################
##########################################################################
#Top level processing script
#marker_file - Tab-delimited table with counts of reads aligned to RADseq cut sites,
#  first column is strain name, first row is marker chromosome,
#  second row is marker (restriction site) position within chromosome.
#  Assumes markers are sorted by chromosome.
#chromosome_file - Tab-delimited table with columns: strain name, chr (mt, 1-7, R, total reads).
#  Values for chromosome columns are the proportion of all reads in that strain aligned
#  to that chromosome. Used to infer relative chromosome copy numbers.
#non_diploid_file - Script assumes euploids are diploid; this file allows this to be overridden.
#  Tab-delimited, two columns: strain name and median chromosome copy number
#offset_file - Converts by-chromosome marker numbering to genome-wide numbering,
#  two columns: chromosome name and offset to convert within-chromosome to genome-wide marker positioning
#coverage_output_dir - Output directory for coverage plots without allele ratios
#log - log data (written out to the logfile at the end of the run)
#logfile - path of output logfile
#het_dir - Output directory for coverage plots with allele ratios
#data_dir - Output directory for data files
#p1_table - Table of counts of "A" alleles at het sites for all strains. First column=markernames (concatenated chromosome and position),
#  second column=allele (i.e. base), remaining columns=strains, first row=header ("Allele", strain names)
#p2_table - Table of counts of "B" alleles at het sites for all strains. First column=markernames (concatenated chromosome and position),
#  second column=allele (i.e. base), remaining columns=strains, first row=header ("Allele", strain names)
#seg_flag - Flag for whether segmental CNV should be activated for allele ratios
#segmental_file - Identifies segmental patterns. Tab-delimited file, first column
#  is strain, second is chromosome and third is a left-to-right description of segment copy numbers,
#  e.g. 12 = copy number one for left end of chromosome and copy number 2 for right end. Maximum of 3 segments.
process_yseq=function(marker_file,chromosome_file,non_diploid_file,offset_file,coverage_output_dir,log,logfile,het_dir,data_dir,p1_table,p2_table,segmental_file,seg_flag){
  #Input offsets file for converting from by-chromosome to whole-genome indexing
  offsets=read.table(offset_file, sep="\t", stringsAsFactors=FALSE)
  #Input counts as list.
  #First element is a matrix of the marker counts with rownames being the strains, and no colnames (marker names) as these are held in the next element.
  #Second element is vector of chromosome component of the marker positions
  #Third element is vector of coverage marker positions within chromosomes (bp)
  count_list=input_counts(marker_file)
  #Ensure markers are sorted numerically within chromosomes
  new_order=order(count_list[[2]],as.numeric(count_list[[3]])) #sort markers numerically within chromosomes
  count_list[[1]]=count_list[[1]][,new_order]
  count_list[[2]]=count_list[[2]][new_order]
  count_list[[3]]=count_list[[3]][new_order]
  #Visualize proportion of missing values by strain and column to assess cutoffs
  #for filtering
  assess_cutoffs(count_list)
  #Do strain and marker trimming. trim_rows: edits only the rows (strains) of counts matrix in count_list[[1]]
  #trim_cols: trims columns in counts matrix in count_list[[1]] and marker names in count_list[[2]] and count_list[[3]]
  count_list=trim_rows(count_list,0.05) #strain must have at least 5% of markers called
  count_list=trim_cols(count_list,0.5) #marker must be called in at least 50% of (remaining) strains
  #Specify which strains are to be used as the normalization set
  #looking for name pattern match (identifies row numbers in the counts matrix)
  euploid_strain_indices=c(grep("9318",rownames(count_list[[1]])),grep("AF7",rownames(count_list[[1]])))
  #Remove any columns that have 0 counts in more than 20% of euploid strains
  count_list=trim_on_euploids(count_list,euploid_strain_indices,0.8)
  #Normalize the counts in each strain to make the median strain value equal to 1 (ignoring 0 counts)
  #Hold normalized ratios in counts list element 4
  count_list=median_strain_norm(count_list)
  #Convert to whole genome numbering
  #Hold in counts list element 5
  count_list=add_whole_genome_numbering(count_list,offsets)
  #For each marker, find median ratio among euploids, hold as counts list element 6.
  #This allows normalizing for marker-specific effects on coverage for all other strains.
  count_list=add_euploid_medians(count_list,euploid_strain_indices)
  #Assess noisiness (SD) of each marker after euploid normalization as counts list element 7
  count_list=add_noise_assess(count_list,euploid_strain_indices)
  #Plot normalized coverage by marker (with noisiness (SD) cutoff of 0.5)
  plot_coverage(coverage_output_dir,count_list,0.5)
  #Estimate chromosome copy numbers for each strain from proportion of aligned reads that are aligned to each chromosome
  #Note assumes the median chromosome copy number for each strain is 2 (so euploids would be diploid),
  #exceptions must be specified in the non_diploid_file
  all_ploidy=normalize_chromosome_file(chromosome_file,non_diploid_file)
  ploidy=t(all_ploidy[[5]])
  #Assign chromosome names
  rownames(ploidy)=c("Ca21chr1_C_albicans_SC5314","Ca21chr2_C_albicans_SC5314","Ca21chr3_C_albicans_SC5314","Ca21chr4_C_albicans_SC5314",
                     "Ca21chr5_C_albicans_SC5314","Ca21chr6_C_albicans_SC5314","Ca21chr7_C_albicans_SC5314","Ca21chrR_C_albicans_SC5314")
  #Input tables of allele counts (reads matching "A" allele and reads matching "B" allele at each SNP site) for each strain
  allele_count_list=count_alleles(rownames(count_list[[1]]),p1_table,p2_table)
  #Convert allele table marker names into a table with two columns, first is chromosome on which marker is found and second is position (bp) of marker
  #Then sort table first on chromosome and then on marker position, sort allele count table in same way
  marker_table=t(sapply(rownames(allele_count_list[[1]]),function(x){y=strsplit(x,"SC5314_")[[1]];y[1]=paste(y[1],"SC5314",sep="");y}))
  new_order=order(marker_table[,1],as.numeric(marker_table[,2]))
  allele_count_list[[1]]=allele_count_list[[1]][new_order,]
  allele_count_list[[2]]=allele_count_list[[2]][new_order,]
  marker_table=marker_table[new_order,]
  #Calculate model probabilities based on observed allele counts
  #This is done for all models (1A:0B, 3A:1B, 2A:1B, 1A:1B, 1A:2B, 1A:3B, 0A:1B)
  #Note that counts of zero will result in log p-values of zero under all models
  model_prob_array=calc_model_probs(allele_count_list)
  #Calculate the best model across all models
  #and the best model based on 1, 2, 3 or 4 copies of chromosomes
  best_model=calc_best_model(model_prob_array)
  #Calculate the score of the best model across all possible models
  #and the score of the best model based on 1, 2, 3 or 4 copies of chromosomes
  lod_scores=calc_best_score(model_prob_array)
  #Expand ploidy table to match number of markers in model and score tables
  #i.e. specify for each marker the copy number of the chromosome on which it is found.
  #At this point assumes no segmental copy number variation, only whole chromosome.
  #Segmentals can be specified later
  ploidy_table=create_ploidy_table(best_model[,,1],ploidy)
  #Method not designed for copy numbers greater than 4, treat these all as a single class of copy number "5" (see below)
  ploidy_table[ploidy_table>4]=5
  #Override ploidy for segmentals based on user-supplied known segmental transition patterns
  if (seg_flag==1){
    ploidy_table=process_segmentals(segmental_file,count_list,ploidy,ploidy_table,marker_table)
  }
  #Extract best allele ratio model and corresponding LOD score for each strain/marker constrained
  #by copy number (identical for all markers on the same chromosome except for segmentals specified above)
  #Note that the placeholder "5" copy number is treated as unconstrained - best of all models is used
  final_model_and_score=extract_final_model_and_score(best_model,lod_scores,ploidy_table)
  #Convert marker positions into whole-genome positions
  wgs_table=extract_chrom_and_wg_pos(final_model_and_score, offsets)
  #Plot allele ratios along with coverage plots with whole-genome scaling
  #with colors indicating allele ratio (copy number "5" treated differently from others)
  plot_coverage_with_het(het_dir,count_list,0.5,wgs_table,final_model_and_score,ploidy_table)
  #Output data
  rownames(ploidy_table)=rownames(lod_scores)
  save(allele_count_list,file=paste(data_dir,"/allele_count_list_r",sep=""))
  save(model_prob_array,file=paste(data_dir,"/model_prob_array_r",sep=""))
  save(best_model,file=paste(data_dir,"/best_model_r",sep=""))
  save(lod_scores,file=paste(data_dir,"/lod_scores_r",sep=""))
  save(ploidy_table,file=paste(data_dir,"/ploidy_table_r",sep=""))
  save(final_model_and_score,file=paste(data_dir,"/final_model_and_score_r",sep=""))
  #Output ploidy table as text file
  write.table(ploidy_table,file=paste(data_dir,"/ploidy_table.txt",sep=""),sep="\t",quote=F)
  #Output log file
  write.table(log,file=logfile,col.names=F,row.names=F,quote=F)
}
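
#Example invocation of the pipeline (illustration only; all file and directory names below are
#hypothetical placeholders, not files distributed with this script). The plot directories are
#given trailing slashes because the plotting functions paste the strain name directly onto them.
if (FALSE){
  process_yseq(marker_file="radseq_marker_counts.txt",
               chromosome_file="chromosome_read_proportions.txt",
               non_diploid_file="non_diploid_strains.txt",
               offset_file="chromosome_offsets.txt",
               coverage_output_dir="coverage_plots/",
               log="run description for the logfile",
               logfile="process_yseq_log.txt",
               het_dir="het_plots/",
               data_dir="data_out",
               p1_table="a_allele_counts.txt",
               p2_table="b_allele_counts.txt",
               segmental_file="segmental_patterns.txt",
               seg_flag=1)
}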

##########################################################################
######################### Accessory Functions ############################
##########################################################################
#Replaces the ploidy calls in ploidy_table (one for each marker) for
#segmental chromosomes. Normally, ploidy calls are the same for all markers
#on a chromosome; for the segmentals these are replaced with a ploidy call
#for each segment, with edges assessed from marker coverage. The ordered
#set of segmental copy numbers is provided by the user (e.g. after manual
#examination of the coverage plots) in table format, with strain and
#chromosome as the first two columns. The third column is a number
#describing the (left to right) segmental copy numbers, e.g. "12" means
#two segments, with the left end of the chromosome having copy number 1
#and the right, copy number 2. The positions of the transitions between the
#segments are estimated by maximum likelihood based on the normalized coverage
#counts for the RADseq markers, assuming a normal distribution with means reflecting
#segment copy number.
#Can deal with 3 segments at most.
#
#Returns edited ploidy_table
#
#Arguments: segmental chromosome file, counts list, chromosome ploidy (ploidy),
#allele marker ploidy table (ploidy_table), allele marker table (marker_table: first column=chromosome, second=position)
process_segmentals=function(segmental_file,count_list,ploidy,ploidy_table,marker_table){
  #Read table defining segmental transitions
  seg_table=read.table(segmental_file, sep="\t",stringsAsFactors=F)
  #Process segmental chromosomes one at a time, replacing corresponding ploidy
  #in ploidy_table with new estimates
  for (i in 1:length(seg_table[,1])){
    #Extract current strain and chromosome and transition pattern
    current_strain=seg_table[i,1]
    current_chrom=seg_table[i,2]
    transitions=as.numeric(strsplit(as.character(seg_table[i,3]),"")[[1]])
    #If segmental strain not present in count list move on to next strain
    if(!(current_strain %in% rownames(count_list[[4]]))){print (c("Missing strain ", current_strain));next}
    #Identify the median coverage ratio for the current strain and
    #the copy number that this reflects. Convert euploid-normalized
    #coverage ratios (count_list[[4]] for this strain divided by
    #euploid medians in count_list[[6]]) to ploidies by multiplying
    #ratios (global median==1) by median ploidy for strain
    #Take log2 of ratios as expect them to be log normal
    count_ratios=count_list[[4]][current_strain,]/count_list[[6]]
    median_ploidy=median(ploidy[,current_strain])
    count_ratios=log(count_ratios*median_ploidy,2)
    #Remove ratio 0 values (log -Inf) from ratios and marker chromosome and position IDs
    filter=!(count_ratios== -Inf)
    all_chroms=count_list[[2]][filter]
    all_pos=count_list[[3]][filter]
    count_ratios=count_ratios[filter]
    #Extract just chromosome of current interest and extract number and position of markers
    filter=all_chroms==current_chrom
    current_counts=count_ratios[filter]
    no_pos=length(current_counts) #number of markers
    current_pos=all_pos[filter] #marker positions
    #Expect count ratios to be approximately log normal around local copy number
    #Create log p-value matrix based on the probability density of each observed
    #normalized count ratio (pointwise ploidy estimate) under normal distributions with
    #means given by each segmental ploidy. Count ratios already log transformed before calculation but
    #log transform the segmentally specified copy numbers (expected means of the normal distributions) here.
    #Note uses an estimate of the SD of all log ratios from this strain, using an 11 marker sliding window
    #and taking the median value
    #Returned matrix has marker positions as cols and rows as the ordered ploidies
    #of the transition vector (top to bottom for segments left to right) with log p-values for each marker
    #under each model.
    #Later, combine these at different transition points to get global likelihood estimates
    global_sd=median(sapply(1:(length(count_ratios)-10),function(x){y=count_ratios[x:(x+10)];sd(y)}))
    p_mat=t(sapply(transitions,function(x){(dnorm(current_counts,log(x,2),global_sd,log=T))}))
    #Find the transition marker index/indices between segments that maximize(s) the data likelihood
    #for the normalized count ratios (pointwise ploidy estimates)
    best_transition_points=c()
    if (length(transitions)==2){
      trans_score_vect=sapply(1:(no_pos-1),function(x){
        first_end=x
        second_start=x+1
        sum(p_mat[1,1:first_end],p_mat[2,second_start:(no_pos)])
      })
      best_transition_points=which(trans_score_vect==max(trans_score_vect))
    } else {
      #For 2 transitions (3 segments)
      trans_score_mat=matrix(nrow=no_pos,ncol=no_pos)
      if (length(transitions)==3){
        for (x in 1:(no_pos-2)){
          first_end=x
          second_start=x+1
          for (y in x:(no_pos-1)){
            second_end=y
            third_start=y+1
            trans_score_mat[x,y]=sum(p_mat[1,1:first_end],p_mat[2,second_start:second_end],p_mat[3,third_start:no_pos])
          }
        }
        best_transition_points=as.numeric(which(trans_score_mat==max(trans_score_mat,na.rm=T),arr.ind=T)[1,])
      }
    }
    #Add left end position (0) to transition points to make it same length as segment state vector (transitions)
    #i.e. one transition means two segments (0-transition point, transition point-end)
    best_transition_points=current_pos[best_transition_points]
    best_transition_points=c(0,best_transition_points)
    #Get vector of ploidies and of positions for all allele markers on this chromosome, for this strain
    #Note that these are the allele markers NOT the coverage markers and coverage ratios that have been used up to now
    current_ploidies=ploidy_table[rownames(ploidy_table)==current_chrom,current_strain]
    current_marker_pos=as.numeric(marker_table[,2])[marker_table[,1]==current_chrom]
    #Replace marker ploidies based on what segment of the chromosome they fall in based on the transition positions
    #(use j here so the outer strain loop index i is not reused)
    for (j in 1:length(transitions)){
      current_ploidies[current_marker_pos>best_transition_points[j]]=transitions[j]
    }
    #Replace ploidies into full ploidy table for this chromosome and this strain
    ploidy_table[rownames(ploidy_table)==current_chrom,current_strain]=current_ploidies
  }
  return (ploidy_table)
}
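
#Toy illustration of the two-segment transition search used in process_segmentals (not part of the
#pipeline; all values below are made up). Thirty log2 coverage ratios are simulated with the first
#15 markers at copy number 1 and the last 15 at copy number 2, and the transition index is recovered
#by maximizing the summed normal log densities, as in the function above.
if (FALSE){
  set.seed(1)
  toy_ratios=c(rnorm(15,log(1,2),0.2),rnorm(15,log(2,2),0.2))
  toy_transitions=c(1,2)
  toy_sd=0.2
  toy_p_mat=t(sapply(toy_transitions,function(x){dnorm(toy_ratios,log(x,2),toy_sd,log=T)}))
  toy_scores=sapply(1:(length(toy_ratios)-1),function(x){
    sum(toy_p_mat[1,1:x],toy_p_mat[2,(x+1):length(toy_ratios)])
  })
  which.max(toy_scores) #expected to fall at (or very near) marker 15
}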

#For each strain, plot normalized coverage ratios and smoothed
#allele ratios.
#
#Arguments: output directory for images, counts list, threshold for
#removing noisy markers based on their SD in euploid strains, marker
#table (first column=chrom, second column=genome-wide position),
#final model and score list (two elements, first is best model given
#ploidy constraint for marker, second is LOD vs second best model),
#ploidy table, giving estimated copy number of each allele marker
#(from whole chromosome, or segmental copy number)
plot_coverage_with_het=function(outdir,counts,noisy_threshold,wgs_table,final_model_and_score,ploidy_table){
  #Define colors for alternating chromosomes and for alleles based on best model (models 1:7 are: HomA, 3A:1B, 2A:1B, 1A:1B, 1A:2B, 1A:3B, HomB)
  customcolors=rep(c("black", "grey70"), 20)
  silly_cols=as.numeric(as.factor(counts[[2]]))
  #Use different color schemes for the 7 allele ratios for each copy number. Note that many of these are placeholders
  #as models impossible for that coverage (e.g. 3:1 ratio when copy number =3)
  #also note that copy number "5" (any copy number >4) uses all potential models but colors them
  #to indicate HomA, A>B, A=B, A<B, HomB
  #Placeholder palette (assumed values; rows=allele models 1:7, cols=copy number 1:5) - substitute preferred colors
  allele_cols=matrix("grey50",nrow=7,ncol=5)
  allele_cols[1,]="red"         #HomA
  allele_cols[2,]="orange"      #3A:1B
  allele_cols[3,]="gold"        #2A:1B
  allele_cols[4,]="purple"      #1A:1B
  allele_cols[5,]="skyblue"     #1A:2B
  allele_cols[6,]="dodgerblue"  #1A:3B
  allele_cols[7,]="blue"        #HomB
  #Output one plot per strain, filtering on coverage in this strain and marker noisiness in the euploids
  for (i in 1:length(rownames(counts[[1]]))){
    strain=rownames(counts[[1]])[i]
    out=paste(outdir,strain,".png",sep="")
    #Normalize current strain counts (already strain median-normalized) using euploid strain medians
    current_norm=counts[[4]][i,]/counts[[6]]
    filter=counts[[1]][i,]>10 & counts[[7]]<noisy_threshold
    if (sum(filter)>0){
      png(out,width = 960, height = 480)
      plot(as.numeric(counts[[5]])[filter],current_norm[filter],pch=19,cex=0.5,log="y",ylim=c(0.2,5),
           col=customcolors[silly_cols[filter]],main=rownames(counts[[1]])[i],
           ylab="Normalized Coverage",xlab="Genomic Position")
      filter2=final_model_and_score[[1]][,strain]>0 & final_model_and_score[[2]][,strain]>log(2,10)
      #Then add the median-smoothed (window=7) allele ratio call identified by color
      allele_nums=runmed(as.numeric(final_model_and_score[[1]][filter2,strain]),7)
      ploidy_nums=runmed(as.numeric(ploidy_table[filter2,strain]),7)
      points(as.numeric(wgs_table[filter2,2]),jitter(rep(4,length(wgs_table[filter2,2])),amount=0.4),cex=0.6,col=allele_cols[cbind(allele_nums,ploidy_nums)])
      dev.off()
    }
  }
}

#Takes a template matrix whose rownames are markers and colnames are strains (initial_table)
#and a ploidy table giving the copy number of each chromosome (rows=chromosomes, cols=strains).
#
#Returns a matrix with the same dimensions as the template matrix, but giving the copy number of
#each marker in each strain, set equal to the copy number of the appropriate chromosome in that strain
#
#Arguments: template marker-by-strain matrix (initial_table), chromosome ploidy table (ploidy)
create_ploidy_table=function(initial_table,ploidy){
  #Initialize output matrix
  ploidy_table=initial_table*0
  #Extract chromosome vector from markernames of initial matrix
  chroms=as.character(sapply(rownames(initial_table),function(x){
    paste(strsplit(x,"SC5314_")[[1]][1],"SC5314",sep="")
  }))
  #Fill in marker copy number from corresponding chromosome copy number, default to 0 as error
  ploidy_table=sapply(chroms,function(x){
    sapply(colnames(initial_table),function(y){
      cov=0
      if((x %in% rownames(ploidy)) & (y %in% colnames(ploidy))){cov=ploidy[x,y]}
      cov
    })
  })
  return (t(ploidy_table))
}
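
#Illustration of the markername convention assumed by create_ploidy_table and the position/offset
#helpers below (not part of the pipeline; the position value is made up): markernames are the
#chromosome name and the within-chromosome position concatenated with "_".
if (FALSE){
  x="Ca21chr1_C_albicans_SC5314_123456"
  y=strsplit(x,"SC5314_")[[1]]      #"Ca21chr1_C_albicans_" "123456"
  y[1]=paste(y[1],"SC5314",sep="")  #restores "Ca21chr1_C_albicans_SC5314"
  y                                 #chromosome name and position (bp)
}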

#Extracts chromosome and marker position from the concatenated unique markernames
#present as rownames in the object provided as argument. Marker positions are converted from
#chromosome-based to whole-genome-based using the offsets table (first column is chrom name, second
#is offset to give whole genome position, for markers on that chromosome).
#
#Returns table with first column = chromosome and second column = whole genome marker position
#
#Arguments: final model and score list (with concatenated marker names as row names), offsets table
extract_chrom_and_wg_pos=function(final_model_and_score, offsets){
  #Split concatenated marker name into chromosome and within-chromosome position as 2 columns
  marker_info=t(sapply(rownames(final_model_and_score[[1]]),function(x){
    y=strsplit(x,"SC5314_")[[1]]
    y[1]=paste(y[1],"SC5314",sep="")
    y
  }))
  #Renumber position using offsets so that it is genome-wide
  for (m in 1:length(offsets[,1])){
    current_chrom=offsets[m,1]
    current_offset=as.numeric(offsets[m,2])
    marker_info[marker_info[,1]==current_chrom,2]=as.numeric(marker_info[marker_info[,1]==current_chrom,2])+current_offset
  }
  return(marker_info)
}

#Extracts best allele model and LOD vs second best model for each marker in each strain based on
#constraint of copy number (1-4) of the marker (previously derived from chromosome or segment copy number)
#Uses LOD scores and best model scores - held in arrays with rows=markers, cols=strains and 5 levels:
#1=haploids/monosomes, 2=disomes/diploids, 3=trisomes/triploids, 4=tetrasomes/tetraploids, 5=unconstrained
#Fifth level is used for error ploidies (>4)
#
#Returns list with first element giving best model (1:7, HomA, 3A:1B, 2A:1B, 1A:1B, 1A:2B, 1A:3B, HomB) for each marker
#given ploidy constraint and second element giving LOD best model vs second best model for each marker.
#Both elements are matrices with rows=markers, cols=strains
#
#Arguments: best model array (first dimension=markers, second=strains, third is ploidy (1:5), entries are best model constrained by marker copy number),
#LOD array - same structure as model array, but holds LOD score of best model vs second best model (constrained by marker copy number),
#ploidy table (rows=markers, cols=strains, values=copy number of that marker (i.e. chromosome))
extract_final_model_and_score=function(best_model,lod_scores,ploidy_table){
  out_list=list()
  #For each copy number constraint, populate the best model from the best model array for markers with that copy number
  final_model=best_model[,,5] #For error ploidy (not 1-4) give best overall model
  final_model[ploidy_table==1]=best_model[,,1][ploidy_table==1]
  final_model[ploidy_table==2]=best_model[,,2][ploidy_table==2]
  final_model[ploidy_table==3]=best_model[,,3][ploidy_table==3]
  final_model[ploidy_table==4]=best_model[,,4][ploidy_table==4]
  #For each copy number constraint, populate the LOD of the best model from the LOD array
  final_score=lod_scores[,,5] #For error ploidy (not 1-4) give LOD of best overall model vs second best overall model
  final_score[ploidy_table==1]=lod_scores[,,1][ploidy_table==1]
  final_score[ploidy_table==2]=lod_scores[,,2][ploidy_table==2]
  final_score[ploidy_table==3]=lod_scores[,,3][ploidy_table==3]
  final_score[ploidy_table==4]=lod_scores[,,4][ploidy_table==4]
  out_list[[1]]=final_model
  out_list[[2]]=final_score
  return (out_list)
}

#Calculates the LOD score of the best model versus the second best model (e.g. HomA vs 1A:1B) for each marker in each strain under 5 copy number constraints:
#N=1 - compare HomA, HomB
#N=2 - compare HomA, 1A:1B, HomB
#N=3 - compare HomA, 2A:1B, 1A:2B, HomB
#N=4 - compare HomA, 3A:1B, 2A:2B (1A:1B), 1A:3B, HomB
#N="5" - Unconstrained - best model across all models - HomA, 3A:1B, 2A:1B, 1A:1B, 1A:2B, 1A:3B, HomB
#
#Returns array with rows=markers, cols=strains and 5 levels, corresponding to the copy number models above in order 1:5.
#Each entry is the LOD best vs second best for that ploidy constraint.
#
#Arguments: model probability array (log10 probabilities; rows=markers, cols=strains, 7 levels, one for each model: HomA, 3A:1B, 2A:1B, 1A:1B, 1A:2B, 1A:3B, HomB).
#Note that these are binomial probabilities without the permutation term, as the permutation term is the same under all models and cancels when the LOD is calculated.
calc_best_score=function(model_prob_array){
  out_array=array(data = 0, dim = c(length(model_prob_array[,1,1]),length(model_prob_array[1,,1]),5),dimnames=list(rownames(model_prob_array),colnames(model_prob_array),c(1:5)))
  #Calculate LOD of best model vs second best model across all models (N="5")
  out_array[,,5]=sapply(1:length(model_prob_array[1,,1]),function(x){
    apply(model_prob_array[,x,],1,function(y){
      z=y[order(as.numeric(y),decreasing=TRUE)]
      z[1]-z[2]
    })
  })
  #For only models consistent with N=1
  out_array[,,1]=sapply(1:length(model_prob_array[1,,1]),function(x){
    apply(model_prob_array[,x,c(1,7)],1,function(y){
      z=y[order(as.numeric(y),decreasing=TRUE)]
      z[1]-z[2]
    })
  })
  #For N=2
  out_array[,,2]=sapply(1:length(model_prob_array[1,,1]),function(x){
    apply(model_prob_array[,x,c(1,4,7)],1,function(y){
      z=y[order(as.numeric(y),decreasing=TRUE)]
      z[1]-z[2]
    })
  })
  #For N=3
  out_array[,,3]=sapply(1:length(model_prob_array[1,,1]),function(x){
    apply(model_prob_array[,x,c(1,3,5,7)],1,function(y){
      z=y[order(as.numeric(y),decreasing=TRUE)]
      z[1]-z[2]
    })
  })
  #For N=4
  out_array[,,4]=sapply(1:length(model_prob_array[1,,1]),function(x){
    apply(model_prob_array[,x,c(1,2,4,6,7)],1,function(y){
      z=y[order(as.numeric(y),decreasing=TRUE)]
      z[1]-z[2]
    })
  })
  return (out_array)
}
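
#Worked toy example of the constrained model choice and LOD scoring performed by calc_best_model
#and calc_best_score (illustration only; the counts are made up). For one marker with 30 "A" reads
#and 10 "B" reads, the log10 probabilities under the 7 models are computed as in calc_model_probs,
#then the best model and LOD are taken over the subset of models allowed for copy number N=3.
if (FALSE){
  a=30; b=10
  probs=c(log(0.99,10)*a+log(0.01,10)*b,  #1 HomA
          log(3/4,10)*a+log(1/4,10)*b,    #2 3A:1B
          log(2/3,10)*a+log(1/3,10)*b,    #3 2A:1B
          log(0.5,10)*a+log(0.5,10)*b,    #4 1A:1B
          log(1/3,10)*a+log(2/3,10)*b,    #5 1A:2B
          log(1/4,10)*a+log(3/4,10)*b,    #6 1A:3B
          log(0.01,10)*a+log(0.99,10)*b)  #7 HomB
  n3=c(1,3,5,7)            #models allowed when copy number = 3
  n3[which.max(probs[n3])] #best model under N=3 (model 3, i.e. 2A:1B, for these counts)
  z=sort(probs[n3],decreasing=TRUE)
  z[1]-z[2]                #LOD of best vs second best under N=3
}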

#Identifies the best model (e.g. HomA or 1A:1B) for each marker in each strain under 5 copy number constraints:
#N=1 - compare HomA, HomB
#N=2 - compare HomA, 1A:1B, HomB
#N=3 - compare HomA, 2A:1B, 1A:2B, HomB
#N=4 - compare HomA, 3A:1B, 2A:2B (1A:1B), 1A:3B, HomB
#N="5" - Unconstrained - best model across all models - HomA, 3A:1B, 2A:1B, 1A:1B, 1A:2B, 1A:3B, HomB
#
#Returns array with rows=markers, cols=strains and 5 levels, corresponding to the copy number models above in order 1:5.
#Each entry is the best model for that ploidy constraint (1:7 = HomA, 3A:1B, 2A:1B, 1A:1B, 1A:2B, 1A:3B, HomB).
#
#Arguments: model probability array (log10 probabilities; rows=markers, cols=strains, 7 levels, one for each model: HomA, 3A:1B, 2A:1B, 1A:1B, 1A:2B, 1A:3B, HomB).
#Note that these are binomial probabilities without the permutation term, as the permutation term is the same under all models and cancels when the LOD is calculated.
calc_best_model=function(model_prob_array){
  out_array=array(data = 0, dim = c(length(model_prob_array[,1,1]),length(model_prob_array[1,,1]),5),dimnames=list(rownames(model_prob_array),colnames(model_prob_array),c(1:5)))
  #Find best model across all models
  print ("Testing All Models")
  out_array[,,5]=sapply(1:length(model_prob_array[1,,1]),function(x){
    apply(model_prob_array[,x,],1,function(y){
      c(1:7)[order(as.numeric(y),decreasing=TRUE)][1]
    })
  })
  #For only models consistent with N=1
  print ("Testing N=1")
  out_array[,,1]=sapply(1:length(model_prob_array[1,,1]),function(x){
    apply(model_prob_array[,x,c(1,7)],1,function(y){
      c(1,7)[order(as.numeric(y),decreasing=TRUE)][1]
    })
  })
  #For only models consistent with N=2
  print ("Testing N=2")
  out_array[,,2]=sapply(1:length(model_prob_array[1,,1]),function(x){
    apply(model_prob_array[,x,c(1,4,7)],1,function(y){
      c(1,4,7)[order(as.numeric(y),decreasing=TRUE)][1]
    })
  })
  #For only models consistent with N=3
  print ("Testing N=3")
  out_array[,,3]=sapply(1:length(model_prob_array[1,,1]),function(x){
    apply(model_prob_array[,x,c(1,3,5,7)],1,function(y){
      c(1,3,5,7)[order(as.numeric(y),decreasing=TRUE)][1]
    })
  })
  #For only models consistent with N=4
  print ("Testing N=4")
  out_array[,,4]=sapply(1:length(model_prob_array[1,,1]),function(x){
    apply(model_prob_array[,x,c(1,2,4,6,7)],1,function(y){
      c(1,2,4,6,7)[order(as.numeric(y),decreasing=TRUE)][1]
    })
  })
  return (out_array)
}

#Calculates the probability of the observed A and B allele frequencies at each strain/marker
#under 7 different haplotype models:
#homozygous A (B error at 0.01), 3A:1B, 2A:1B, 1A:1B (or 2A:2B), 1A:2B, 1A:3B, hom B (A error at 0.01)
#
#Returns array, rows=markers, cols=strains, and the 3rd dimension has 7 levels, each corresponding to one of the models above in order 1:7.
#Each entry is the binomial probability of the observed allele counts in that strain, for that marker, using the current model.
#NOTE THAT PERMUTATION COMPONENT OF PROBABILITY HAS BEEN DROPPED AS IT CANCELS OUT IN CALCULATING LOD
#SCORES BETWEEN MODELS AT A LATER STEP
#
#Arguments: Allele count list (first element is A counts, second is B counts, counts in table, rows are markers, first column is allele name (A, T, G or C), rest are strains with values A or B allele counts at that marker in that strain)
calc_model_probs=function(allele_count_list){
  #First get the allele counts (drop the vector of alleles from the first column)
  a_counts=allele_count_list[[1]][,-1]
  b_counts=allele_count_list[[2]][,-1]
  #Create an array with 7 levels (one for each model) and each level a matrix with rows=markers and cols=strains;
  #each entry is the binomial probability of the observed allele counts in that strain, for that marker, using the current model.
  #Note that the permutation term has been removed from the p-value as these cancel
  #out when calculating LOD scores between models
  model_calcs=array(data = 0, dim = c(length(a_counts[,1]),length(a_counts[1,]),7),dimnames=list(rownames(a_counts),colnames(a_counts),c(1:7)))
  model_calcs[,,1]=as.matrix(log(0.99,10)*a_counts+log(0.01,10)*b_counts)
  model_calcs[,,2]=as.matrix(log(3/4,10)*a_counts+log(1/4,10)*b_counts)
  model_calcs[,,3]=as.matrix(log(2/3,10)*a_counts+log(1/3,10)*b_counts)
  model_calcs[,,4]=as.matrix(log(0.5,10)*a_counts+log(0.5,10)*b_counts)
  model_calcs[,,5]=as.matrix(log(1/3,10)*a_counts+log(2/3,10)*b_counts)
  model_calcs[,,6]=as.matrix(log(1/4,10)*a_counts+log(3/4,10)*b_counts)
  model_calcs[,,7]=as.matrix(log(0.01,10)*a_counts+log(0.99,10)*b_counts)
  return(model_calcs)
}

#Creates allele count list with element 1 being the counts of A alleles at each observed marker in all
#strains, and the 2nd element being the B counts. Columns after the first are strain names, rows are markers with markernames
#as row names. First column is the allele (A or B as appropriate, with col name "Allele", values will be A, T, G or C)
#
#Returns allele count list
#
#Arguments: strain names and "A" and "B" allele count tables
count_alleles=function(labels,p1_file,p2_file){
  out_list=list()
  marker_names=c() #Vector of all marker names across all strains (no duplicate markers)
  #Input the allele call files
  p1_table=read.table(p1_file, sep="\t", stringsAsFactors=FALSE,check.names=F,header=T)
  p2_table=read.table(p2_file, sep="\t", stringsAsFactors=FALSE,check.names=F,header=T)
  #Set markernames as rownames
  rownames(p1_table)=p1_table[,1]
  rownames(p2_table)=p2_table[,1]
  p1_table=p1_table[,-1]
  p2_table=p2_table[,-1]
  #Keep only the "Allele" column and the requested strains that are present in the tables
  reverse_filter=labels %in% colnames(p1_table)
  p1_table=p1_table[,c("Allele",labels[reverse_filter])]
  p2_table=p2_table[,c("Allele",labels[reverse_filter])]
  #Remove any positions with only a single non-zero value
  p1_good=apply(p1_table,1,function(x){length(which(x>0))>1})
  p2_good=apply(p2_table,1,function(x){length(which(x>0))>1})
  filter=p1_good & p2_good
  p1_table=p1_table[filter,]
  p2_table=p2_table[filter,]
  #Output the A and B allele count tables
  out_list[[1]]=p1_table
  out_list[[2]]=p2_table
  return (out_list)
}

#Plots normalized coverage using counts list
#Markers are filtered based on their noisiness in the euploid control strains
#
#Arguments: output directory for images, counts list and threshold for filtering noisy markers
plot_coverage=function(outdir,counts,noisy_threshold){
  #Set alternating grey and black colors for chromosomes
  customcolors=rep(c("black", "grey70"), 20)
  silly_cols=as.numeric(as.factor(counts[[2]]))
  #Output results on a by-marker basis, using the noisy threshold to clean up the marker noise
  #and only plotting points with at least a count of 10 in the current strain
  for (i in 1:length(rownames(counts[[1]]))){
    #Set output filename and normalize current strain counts (already strain median-normalized) using
    #euploid strain medians (controls for systemic high- and low-coverage markers caused by fragment length effects etc)
    out=paste(outdir,rownames(counts[[1]])[i],".png",sep="")
    current_norm=counts[[4]][i,]/counts[[6]]
    #No plotting if no markers pass filters
    filter=counts[[1]][i,]>10 & counts[[7]]<noisy_threshold
    if (sum(filter)>0){
      png(out,width = 960, height = 480)
      plot(current_norm[filter],pch=19,cex=0.5,log="y",ylim=c(0.1,10),
           col=customcolors[silly_cols[filter]],main=rownames(counts[[1]])[i],
           ylab="Normalized Coverage")
      dev.off()
    }
  }
}

#Assess marker standard deviation in euploids, allowing highly noisy markers to
#be identified and filtered later. Measures SD of log transformed normalized
#coverage for each marker in the euploids.
#
#Returns counts list with the vector of log-coverage SD values added as element 7.
#
#Arguments: counts list (uses element 4 - matrix of counts normalized for each strain and element 6 -
#median normalized ratio across all euploids) and euploid strain indices within counts list element 4
add_noise_assess=function(counts,euploid_strain_nos){
  #Normalize euploid markers for marker-specific effects on coverage
  temp_norm=t(apply(counts[[4]][euploid_strain_nos,],1,function(x){x/counts[[6]]}))
  #Measure SD of log transformed values for each marker, ignoring 0 values
  counts[[7]]=apply(temp_norm,2,function(x){sd(log(x[x>0]))})
  hist(counts[[7]],xlab="SD of Normalized Coverage", main="")
  return(counts)
}

#Add vector of euploid medians to counts list as element 6,
#based on counts element 4 (median normalized counts for each strain)
#
#Returns modified list.
#
#Arguments: counts list and euploid strain indices
add_euploid_medians=function(counts,euploid_strain_indices){
  #Calculate median (ignoring 0 values) for each marker in euploids
  counts[[6]]=apply(counts[[4]][euploid_strain_indices,],2,function(x){median(x[x>0])})
  return(counts)
}

#Uses offsets table to convert coverage marker positions within each chromosome
#to genome-wide positions. Takes counts list and offsets table and converts
#counts list element 3 (position) to genome-wide version as element 5
#
#Returns modified counts list with genome-wide marker positions as element 5
#
#Arguments: counts list and offsets table
add_whole_genome_numbering=function(counts,offsets){
  #Get within-chromosome indexing vector
  wg_pos_names=counts[[3]]
  #Process chromosomes one at a time
  for (m in 1:length(offsets[,1])){
    #Get offset for current chromosome
    current_chrom=offsets[m,1]
    current_offset=as.numeric(offsets[m,2])
    #Match chromosome (list element 2) to current chromosome and offset positions accordingly
    wg_pos_names[counts[[2]]==current_chrom]=wg_pos_names[counts[[2]]==current_chrom]+current_offset
  }
  counts[[5]]=wg_pos_names
  return(counts)
}
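
#Illustration of the offsets table layout assumed above (not part of the pipeline; the offset value
#is made up): column 1 = chromosome name, column 2 = offset added to within-chromosome positions,
#so with this table a marker at position 50000 on Ca21chr2 gets genome-wide position 3200000+50000.
if (FALSE){
  offsets=data.frame(V1=c("Ca21chr1_C_albicans_SC5314","Ca21chr2_C_albicans_SC5314"),
                     V2=c(0,3200000))
}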

#Input counts file and convert to first element in a list, held as a matrix
#with rownames being the strains, and no colnames; each row is a strain and each column
#is a marker, values are counts for that strain/marker combo. Second element is a vector
#of the chromosome names for the markers. Third element is marker position within its chromosome (bp)
#Original counts file should have: first column = strain name, rest of columns
#are the marker counts, rows are strains. There may also be a final total counts column
#("Total_Reads"), which is removed. First row is chromosome, second row is base position on that chromosome.
#
#Returns counts list with 3 elements described above
#
#Arguments: counts filename
input_counts=function(file_name){
  #Input data
  counts=list()
  counts[[1]]=read.table(file_name, sep="\t", stringsAsFactors=FALSE)
  #Remove totals column if it exists
  last_col=length(counts[[1]][1,])
  if(counts[[1]][1,last_col]=="Total_Reads"){counts[[1]]=counts[[1]][,-last_col]}
  #Extract rownames
  rownames(counts[[1]])=counts[[1]][,1]
  counts[[1]]=counts[[1]][,-1]
  #Get chrom and position
  counts[[2]]=as.character(counts[[1]][1,])
  counts[[3]]=as.numeric(counts[[1]][2,])
  counts[[1]]=counts[[1]][c(-1,-2),]
  #Convert NA to 0 and make into numeric matrix
  temp=rownames(counts[[1]])
  counts[[1]][is.na(counts[[1]])]=0
  counts[[1]]=as.matrix(counts[[1]])
  counts[[1]]=apply(counts[[1]],2,as.numeric)
  rownames(counts[[1]])=temp
  return(counts)
}

#Visualize proportion of 0 marker calls by strain and by marker position;
#allows user to define cutoffs for trimming low-coverage strains and markers
#
#Arguments: counts list (uses element 1 - raw counts per strain)
assess_cutoffs=function(counts){
  #Examine distribution of missing data by row (strain) and by column (marker)
  col_prop=apply(counts[[1]],2,function(x){x=as.numeric(x); sum(x>0)/length(x)})
  row_prop=apply(counts[[1]],1,function(x){x=as.numeric(x); sum(x>0)/length(x)})
  hist(col_prop,xlab="Proportion of Alleles Called",main="Markers")
  hist(row_prop,xlab="Proportion of Alleles Called",main="Strains")
}

#Remove strains that have too much missing data using cutoff defined by user.
#
#Returns counts list with modified element 1, with strains failing cutoff removed
#
#Arguments: counts list and missing strain cutoff
trim_rows=function(counts,strain_cutoff){
  #Examine distribution of missing data by strain
  row_prop=apply(counts[[1]],1,function(x){x=as.numeric(x); sum(x>0)/length(x)})
  good_strains=c(row_prop>strain_cutoff)
  counts[[1]]=counts[[1]][good_strains,]
  return(counts)
}

#Remove markers that have too much missing data.
#Cutoff defined by user.
#
#Returns modified counts list with elements 1 (counts), 2 (marker chromosome) and 3 (marker bp position in chromosome)
#trimmed to remove markers with high missing data.
#
#Arguments: counts list and missing marker cutoff
trim_cols=function(counts,marker_cutoff){
  #Examine distribution of missing data by column (marker)
  col_prop=apply(counts[[1]],2,function(x){x=as.numeric(x); sum(x>0)/length(x)})
  good_markers=c(col_prop>marker_cutoff)
  counts[[1]]=counts[[1]][,good_markers]
  counts[[2]]=counts[[2]][good_markers]
  counts[[3]]=counts[[3]][good_markers]
  return(counts)
}

#Remove markers that are absent in too many of the euploid control strains.
#Cutoff defined by user.
#
#Returns modified counts list with elements 1 (counts), 2 (marker chromosome) and 3 (marker bp position in chromosome)
#trimmed to remove markers with high missing data in euploids.
#
#Arguments: counts list, euploid strain indices (for euploid rows in counts[[1]]) and missing marker cutoff (proportion)
trim_on_euploids=function(counts,euploid_strain_indices,cutoff){
  #Apply missing data marker cutoffs to euploid rows of counts[[1]]
  good_cols=apply(counts[[1]][euploid_strain_indices,],2,function(x){x=as.numeric(x); sum(x>0)/length(x) >cutoff})
  counts[[1]]=counts[[1]][,good_cols]
  counts[[2]]=counts[[2]][good_cols]
  counts[[3]]=counts[[3]][good_cols]
  return(counts)
}
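
#Toy illustration of the row/column trimming logic used by trim_rows and trim_cols (not part of the
#pipeline; the counts are made up). A strain (row) is kept if more than 5% of its markers have
#non-zero counts; a marker (column) is kept if it is non-zero in more than 50% of strains.
if (FALSE){
  toy=matrix(c(4,0,7,
               0,0,5,
               0,0,0,
               8,2,6),nrow=4,byrow=TRUE)
  apply(toy,1,function(x){sum(x>0)/length(x)})>0.05 #strains passing the trim_rows cutoff
  apply(toy,2,function(x){sum(x>0)/length(x)})>0.5  #markers passing the trim_cols cutoff
}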

#Normalize coverage values for each strain to the median value for that strain.
#Operates on first element of counts list (counts matrix with rows=strains, cols=markers)
#
#Returns counts list with the normalized matrix as element 4
#
#Arguments: counts list
median_strain_norm=function(counts){
  #Divide coverage values by median (of non-zero values)
  counts[[4]]=t(apply(counts[[1]],1,function(x){x/median(x[x>0])}))
  return(counts)
}

#Estimates copy number of each chromosome for each strain based on the proportion
#of all aligned reads that align to that chromosome and normalization of those
#values to a reference panel of euploid strains. Note will default to a median
#copy number of 2 for euploids, i.e. they will be called as diploids. Default behavior
#can be overridden by specifying the median chromosome copy number for a strain
#in a separate ploidy_adjust_file
#
#Returns: list with following components:
#[[1]]=raw aligned read proportion for each strain
#[[2]]=proportions normalized to euploid controls
#[[3]]=chromosome median normalized to 1 for each strain
#[[4]]=ploidy_estimate - element 3 multiplied by factor that gives minimum RMS versus nearest whole number for each chromosome
#[[5]]=element 4 rounded to nearest whole number
#
#Arguments: chromosome_file (rows=strains, columns=proportion of reads aligned to each chromosome),
# ploidy_adjust_file (override for strains where median chromosome copy number !=2)
normalize_chromosome_file=function(chromosome_file, ploidy_adjust_file){
  #List to hold output
  out=list()
  #Input proportion of reads on each chromosome to chromosome table
  #and read in ploidy override table
  raw_chrom=read.table(chromosome_file,stringsAsFactors=F,sep="\t")
  ploidy_override=read.table(ploidy_adjust_file,stringsAsFactors=F,sep="\t")
  #Extract strain names as rownames and keep chr1-R columns
  ploidy_vect=as.numeric(ploidy_override[,2])
  names(ploidy_vect)=ploidy_override[,1]
  rownames(raw_chrom)=raw_chrom[,1]
  raw_chrom=raw_chrom[,3:10]
  #Identify control strains and calculate medians for each chromosome
  control_nos=c(grep("9318",rownames(raw_chrom)),grep("AF7",rownames(raw_chrom)))
  control_meds=apply(raw_chrom[control_nos,],2,median)
  #Divide all chromosomes by control medians
  first_norm_chrom=t(apply(raw_chrom,1,function(x){x/control_meds}))
  #Normalize median chromosome of each strain to 1
  second_norm_chrom=t(apply(first_norm_chrom,1,function(x){x/median(x)}))
  #Create vector of factors from 1.5 to 5.5
  test_factors=c(15:55)/10
  #Convert to ploidy estimate
  ploidy_estimate=t(apply(second_norm_chrom,1,function(y){
    #Create ploidy values for each level of the factor
    ploidy_range=t(sapply(test_factors,function(x){x*y}))
    #Calculate RMS vs nearest whole numbers, note that for euploids this will
    #give 2 as the best result
    round_ploidy_range=round(ploidy_range)
    diff_ploidy=round_ploidy_range-ploidy_range
    rms_ploidy=apply(diff_ploidy,1,function(x){mean(x**2)**0.5})
    #Do not allow ploidy estimates higher than 5
    too_high=apply(round_ploidy_range,1,function(x){any(x>5)})
    rms_ploidy[too_high]=1000
    final_ploidy=ploidy_range[which(rms_ploidy==min(rms_ploidy))[1],]
  }))
  #Extend limit factors to cover haploids (where specified in the override table)
  test_factors=c((1:14)/10,test_factors)
  #Carry out override
  for (i in 1:length(ploidy_vect)){
    #Get current strain name and the preset median chromosome ploidy
    strain=names(ploidy_vect)[i]
    real_ploidy=ploidy_vect[i]
    #Only use range of factors around override preset so that when rounded the median will have
    #the preset value
    limit_factors=test_factors[test_factors>real_ploidy-0.5 & test_factors<real_ploidy+0.5]