from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import re
import numpy as np
import pandas as pd
import SequenceDataORM_updt as sqd
import circularspacingtests as cst
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Paper-wide matplotlib style sheet (absolute path on the author's machine).
plt.style.use('C:/Users/Nicholas Sherer/.matplotlib/ShererPaper.mplstyle')
# SQLite database of mutation calls; echo=False suppresses SQL statement logging.
engine = create_engine('sqlite:///NS001_evolved_mutations_copy8.db', echo=False) # our database connection
session = sessionmaker(bind=engine)() # the session object is how we make queries through sqlalchemy
# IPython magic: figures saved below land in the Paper Figures directory.
cd "Paper Figures"
While doing earlier runs of this analysis, I found many clusters of mutations that should have been classified as a single mutational event: they occurred in only one well and on the same set of reads. This meant the cluster-finding strategy I took from "Detecting Clusters of Mutations" by Zhou et al. was finding not only clusters due to natural selection but also groups of nearby mutation calls that were really one mutational event.
# Pull every SNP mutation from the database, ordered by chromosome position.
snp_mutations = list(session.query(sqd.SNP_Mutation)
                     .order_by(sqd.SNP_Mutation.chr_position))
def snp_covs(snp_mut):
    """Return the new_cov string of each sample's evidence for this SNP.

    Looks up the SNP_Evidence row matching the mutation's position and base
    change for every sample carrying the mutation, in snp_mut.samples order.
    """
    covs = []
    for sample in snp_mut.samples:
        evidence = (session.query(sqd.SNP_Evidence)
                    .filter(sqd.SNP_Evidence.sample == sample,
                            sqd.SNP_Evidence.chr_position == snp_mut.chr_position,
                            sqd.SNP_Evidence.ref_base == snp_mut.ref_base,
                            sqd.SNP_Evidence.new_base == snp_mut.new_base)
                    .first())
        covs.append(evidence.new_cov)
    return covs
def is_close_cov(cov1, cov2, tol=1):
    """Return True if two coverage strings describe essentially the same reads.

    Each coverage is an 'a/b' string of two integer read counts (presumably
    forward/reverse strand counts from breseq — TODO confirm). Two coverages
    are "close" when one count matches exactly and the other differs by at
    most `tol` reads. `tol` defaults to 1, the original hard-coded behavior.
    """
    a1, b1 = (int(part) for part in cov1.split('/'))
    a2, b2 = (int(part) for part in cov2.split('/'))
    return ((a1 == a2 and abs(b1 - b2) <= tol) or
            (abs(a1 - a2) <= tol and b1 == b2))
def merge_snps(sorted_snp_list):
    """Collapse runs of SNPs that were almost certainly one mutational event.

    Consecutive SNPs (input must be sorted by chr_position) are merged into
    the first SNP of the run when they lie within 100 bp of it, occur in
    exactly the same set of samples, and have closely matching read coverage.
    Returns the list of representative SNPs, one per inferred event.
    """
    # Guard: the original indexed sorted_snp_list[0] and crashed on [].
    if not sorted_snp_list:
        return []
    merged_list = [sorted_snp_list[0]]
    curr_samples = merged_list[0].samples
    curr_position = merged_list[0].chr_position
    curr_covs = snp_covs(merged_list[0])
    for snp in sorted_snp_list[1:]:
        # NOTE(review): zip pairs coverages positionally; this assumes
        # snp.samples is ordered the same way for both SNPs — confirm.
        same_event = (snp.chr_position <= curr_position + 100 and
                      set(snp.samples) == set(curr_samples) and
                      np.all([is_close_cov(cov, curr_cov)
                              for cov, curr_cov in zip(snp_covs(snp), curr_covs)]))
        if not same_event:
            # Start a new run anchored at this SNP.
            merged_list.append(snp)
            curr_samples = snp.samples
            curr_position = snp.chr_position
            curr_covs = snp_covs(snp)
    return merged_list
# How many raw SNP calls came back from the database?
len(snp_mutations)
# Collapse nearby same-sample, same-coverage SNP calls into single events.
merged_snps = merge_snps(snp_mutations)
len(merged_snps)
# Drop every SNP seen in any ancestor sample: those predate the experiment.
ANCESTOR_SAMPLES = ('Aggregate_NS001_Ancestors',
                    'Ancestor_S1', 'Ancestor_S2', 'Ancestor_S3')
nonancestral_mutations = [snp for snp in merged_snps
                          if all(anc not in snp.samples
                                 for anc in ANCESTOR_SAMPLES)]
len(nonancestral_mutations)
Any mutations that appear in two different wells almost certainly were in the ancestor but undetected. Let's filter those out.
# Keep only SNPs seen in at most two samples: a mutation shared by more
# samples was almost certainly present (undetected) in the ancestor.
# NOTE(review): the prose above says "two different wells" but this allows
# up to 2 samples — confirm how samples map to wells.
nonancestral_mutations = [snp for snp in nonancestral_mutations if len(snp.samples) <= 2]
len(nonancestral_mutations)
# Restrict to nonsynonymous SNPs for the per-gene clustering analysis.
nonsyn_nonanc_mutations = [snp for snp in nonancestral_mutations if not snp.synonymous]
len(nonsyn_nonanc_mutations)
# Count nonsynonymous, nonancestral SNPs per gene.
count_dict = {}
for mut in nonsyn_nonanc_mutations:
    # dict.get replaces the explicit membership test.
    count_dict[mut.gene] = count_dict.get(mut.gene, 0) + 1
# sort_values already returns a Series; the extra pd.Series wrapper was redundant.
mutation_counts = pd.Series(count_dict).sort_values(ascending=False)
len(mutation_counts)
# How many genes carry more than one mutation?
np.sum(mutation_counts > 1)
mutation_counts[:31]
We found 825 nonancestral mutations and only looked at nonsynonymous mutations within each gene, but to avoid false positives, we'll calculate how often 1000 mutations randomly distributed throughout the E. coli chromosome would lead to a cluster of mutations as large as the cluster in each gene and in a length less than or equal to that of the gene.
We only need to look at genes that had 2 or more mutations since we're looking at the likelihood of multiple mutations clustered together.
We calculate the probability of a cluster by simulation and will be using the traditional p_value cutoff of .05 (our simulation method means this is already corrected for multiple comparisons across the whole chromosome) so we only take 100,000 draws for our simulations to calculate the probability each cluster would occur by chance. If a 0 is returned that means almost certainly p < 10^-3 and likely less than 10^-4.
TOTAL_MUTATIONS = 1000        # assumed number of mutations scattered over the chromosome
CHROMOSOME_LENGTH = 4641652   # E. coli chromosome length in bp
# For the 30 most-mutated genes, simulate how often a cluster at least this
# large would appear by chance within the gene's length.
for gene, n_mutations in mutation_counts.head(30).items():
    length = session.query(sqd.Gene).filter(sqd.Gene.name == gene).first().length
    p = cst.p_extreme_cluster(n_mutations, length, TOTAL_MUTATIONS,
                              CHROMOSOME_LENGTH, max_draws=10**5)
    print(gene, n_mutations, length, p)
The only gene that passes this threshold is the sufB gene.
# Collect every piece of SNP evidence, grouped by gene, for manual inspection.
mutation_ev_by_gene = {}
for gene_name in count_dict:
    evidence_query = (session.query(sqd.SNP_Evidence)
                      .join(sqd.SNP_Mutation)
                      .filter(sqd.SNP_Mutation.gene == gene_name)
                      .order_by(sqd.SNP_Evidence.chr_position,
                                sqd.SNP_Evidence.sample))
    mutation_ev_by_gene[gene_name] = list(evidence_query)
# Inspect the evidence behind the sufB cluster.
for mut in mutation_ev_by_gene['sufB']:
    print(mut, mut.frequency)
First filter out ancestral mutations. There is no such thing as a synonymous deletion so we skip that step. We discovered the problem with unmerged mutations for deletions before loading the database with the breseq run data, so that step has already been handled.
# Deletions: same pipeline as SNPs — load sorted, drop ancestral, drop shared.
del_mutations = list(session.query(sqd.DEL_Mutation)
                     .order_by(sqd.DEL_Mutation.chr_position))
len(del_mutations)
ANCESTOR_SAMPLES = ('Aggregate_NS001_Ancestors',
                    'Ancestor_S1', 'Ancestor_S2', 'Ancestor_S3')
nonancestral_deletions = [dele for dele in del_mutations
                          if all(anc not in dele.samples
                                 for anc in ANCESTOR_SAMPLES)]
nonancestral_deletions = [dele for dele in nonancestral_deletions
                          if len(dele.samples) <= 2]
len(nonancestral_deletions)
# Count nonancestral deletions per gene.
count_dele = {}
for dele in nonancestral_deletions:
    # dict.get replaces the explicit membership test.
    count_dele[dele.gene] = count_dele.get(dele.gene, 0) + 1
# sort_values already returns a Series; the extra pd.Series wrapper was redundant.
deletion_counts = pd.Series(count_dele).sort_values(ascending=False)
deletion_counts
The only gene with multiple deletions is sufS. Let's check the p-value for having this many deletions
# sufS carried 3 deletions; test whether such a cluster could arise by chance
# among TOTAL_DELETIONS deletions scattered over the chromosome.
TOTAL_DELETIONS = 40
n_sufS_deletions = 3
length_sufS = session.query(sqd.Gene).filter(sqd.Gene.name == 'sufS').first().length
p_sufS = cst.p_extreme_cluster(n_sufS_deletions, length_sufS, TOTAL_DELETIONS,
                               CHROMOSOME_LENGTH, max_draws=10**5)
print('sufS', n_sufS_deletions, length_sufS, p_sufS)
That passes the threshold for significance (and is already corrected for multiple comparisons)
# Collect deletion evidence, grouped by gene, for manual inspection.
del_ev_by_gene = {}
for gene_name in count_dele:
    evidence_query = (session.query(sqd.DEL_Evidence)
                      .join(sqd.DEL_Mutation)
                      .filter(sqd.DEL_Mutation.gene == gene_name)
                      .order_by(sqd.DEL_Evidence.chr_position,
                                sqd.DEL_Evidence.sample))
    del_ev_by_gene[gene_name] = list(evidence_query)
# Inspect the evidence behind the sufS deletions.
for dele in del_ev_by_gene['sufS']:
    print(dele, dele.frequency)
First filter out ancestral mutations. There is no such thing as a synonymous insertion so we skip that step. We discovered the problem with unmerged mutations for insertions before loading the database with the breseq run data, so that step has already been handled.
# Insertions: same pipeline — load sorted, drop ancestral, drop shared.
ins_mutations = list(session.query(sqd.INS_Mutation)
                     .order_by(sqd.INS_Mutation.chr_position))
len(ins_mutations)
ANCESTOR_SAMPLES = ('Aggregate_NS001_Ancestors',
                    'Ancestor_S1', 'Ancestor_S2', 'Ancestor_S3')
nonancestral_insertions = [ins for ins in ins_mutations
                           if all(anc not in ins.samples
                                  for anc in ANCESTOR_SAMPLES)]
nonancestral_insertions = [ins for ins in nonancestral_insertions
                           if len(ins.samples) <= 2]
len(nonancestral_insertions)
# Count nonancestral insertions per gene.
count_ins = {}
for ins in nonancestral_insertions:
    # dict.get replaces the explicit membership test.
    count_ins[ins.gene] = count_ins.get(ins.gene, 0) + 1
# sort_values already returns a Series; the extra pd.Series wrapper was redundant.
insertion_counts = pd.Series(count_ins).sort_values(ascending=False)
insertion_counts
There are no genes with multiple nonancestral insertions.
# Collect insertion evidence, grouped by gene, for manual inspection.
ins_ev_by_gene = {}
for gene_name in count_ins:
    evidence_query = (session.query(sqd.INS_Evidence)
                      .join(sqd.INS_Mutation)
                      .filter(sqd.INS_Mutation.gene == gene_name)
                      .order_by(sqd.INS_Evidence.chr_position,
                                sqd.INS_Evidence.sample))
    ins_ev_by_gene[gene_name] = list(evidence_query)
# NOTE(review): 'sufS' here looks copy-pasted from the deletion section above;
# this raises KeyError unless sufS actually has an insertion — confirm the
# intended gene.
for ins in ins_ev_by_gene['sufS']:
    print(ins, ins.frequency)
# Mobile-element (MOB) evidence is sparse enough to simply print it all.
for mob in session.query(sqd.MOB_Evidence).order_by(sqd.MOB_Evidence.sample):
    print(mob, mob.frequency)
Although only sufB passes the threshold of statistical significance, there are 3 SNPs in sufC as well, and maybe a mutation in sufA, sufD, sufE, or sufS. What happens if we look at the entire operon's length vs the number of mutations in it?
# Pool all six suf-operon genes and test the whole operon as a single cluster.
sufs = ['suf' + letter for letter in ['A', 'B', 'C', 'D', 'E', 'S']]
suf_operon_length = 0
suf_snp_count = 0
print('For each gene individually, the number of mutations, gene length, and p_value of the cluster is')
for gene in sufs:
    # Series.get replaces the original try/except KeyError for genes
    # with no recorded SNPs.
    n_snps = mutation_counts.get(gene, 0)
    suf_snp_count += n_snps
    length = session.query(sqd.Gene).filter(sqd.Gene.name == gene).first().length
    suf_operon_length += length
    p = cst.p_extreme_cluster(n_snps, length, TOTAL_MUTATIONS,
                              CHROMOSOME_LENGTH, max_draws=10**5)
    print(gene, n_snps, length, p)
print('For the entire suf operon, the number of mutations, operon_length, and p_value of the cluster is')
# More draws here (10**6) for a tighter estimate of the operon-level p-value.
p_operon = cst.p_extreme_cluster(suf_snp_count, suf_operon_length,
                                 TOTAL_MUTATIONS, CHROMOSOME_LENGTH,
                                 max_draws=10**6)
print('suf operon', suf_snp_count, suf_operon_length, p_operon)