In [1]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import re
import numpy as np
import pandas as pd

import SequenceDataORM_updt as sqd
import circularspacingtests as cst

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('C:/Users/Nicholas Sherer/.matplotlib/ShererPaper.mplstyle')
In [2]:
engine = create_engine('sqlite:///NS001_evolved_mutations_copy8.db', echo=False) # our database connection
session = sessionmaker(bind=engine)() # the session object is how we make queries through sqlalchemy
In [3]:
cd "Paper Figures"
C:\Users\Nicholas Sherer\Box\N_Sherer_Thesis_Materials\SeqDBandNBs\Paper Figures

Group snps that should have been grouped

While doing earlier runs of this analysis, I found many clusters of mutations that should have been classified as a single mutational event. They occurred in only one well and on the same set of reads. This meant the cluster finding strategy I took from "Detecting Clusters of Mutations" by Zhou et al. was finding not only clusters due to natural selection but multiple mutations that should have been one mutation event.

In [4]:
snp_mutations = [mut for mut in session.query(sqd.SNP_Mutation).order_by(sqd.SNP_Mutation.chr_position)]
In [5]:
def snp_covs(snp_mut):
    smpls = snp_mut.samples
    new_covs = [(session.query(sqd.SNP_Evidence)
                        .filter(sqd.SNP_Evidence.sample == smpl)
                        .filter(sqd.SNP_Evidence.chr_position == snp_mut.chr_position)
                        .filter(sqd.SNP_Evidence.ref_base == snp_mut.ref_base)
                        .filter(sqd.SNP_Evidence.new_base == snp_mut.new_base)
                        .first()).new_cov for smpl in smpls]
    return new_covs

def is_close_cov(cov1, cov2):
    x1, y1 = [int(word) for word in cov1.split('/')]
    x2, y2 = [int(word) for word in cov2.split('/')]
    return ((x1 == x2) and (np.abs(y1-y2) <= 1)) or ((np.abs(x1 - x2) <= 1) and (y1 == y2))

def merge_snps(sorted_snp_list):
    merged_list = [sorted_snp_list[0]]
    curr_samples = merged_list[0].samples
    curr_position = merged_list[0].chr_position
    curr_covs = snp_covs(merged_list[0])
    for snp in sorted_snp_list[1:]:
        if (snp.chr_position <= curr_position + 100 and 
            set(snp.samples) == set(curr_samples) and
            np.all([is_close_cov(cov, curr_cov) for cov, curr_cov in zip(snp_covs(snp), curr_covs)])):
                pass
        else:
            merged_list.append(snp)
            curr_samples = snp.samples
            curr_position = snp.chr_position
            curr_covs = snp_covs(snp)
    return merged_list
In [6]:
len(snp_mutations)
Out[6]:
937
In [7]:
merged_snps = merge_snps(snp_mutations)
In [8]:
len(merged_snps)
Out[8]:
909

Filter out ancestral mutations

In [9]:
nonancestral_mutations = [snp for snp in merged_snps
                          if ('Aggregate_NS001_Ancestors' not in snp.samples) and
                             ('Ancestor_S1' not in snp.samples) and
                             ('Ancestor_S2' not in snp.samples) and
                             ('Ancestor_S3' not in snp.samples)]
In [10]:
len(nonancestral_mutations)
Out[10]:
846

Filter out undetected ancestral mutations

Any mutations that appear in two different wells almost certainly were in the ancestor but undetected. Let's filter those out.

In [11]:
nonancestral_mutations = [snp for snp in nonancestral_mutations if len(snp.samples) <= 2]
In [12]:
len(nonancestral_mutations)
Out[12]:
825

Remove synonymous mutations

In [13]:
nonsyn_nonanc_mutations = [snp for snp in nonancestral_mutations if not snp.synonymous]
In [14]:
len(nonsyn_nonanc_mutations)
Out[14]:
488

Count up the number of unique mutations found by gene

In [15]:
count_dict = {}
for mut in nonsyn_nonanc_mutations:
    if mut.gene in count_dict:
        count_dict[mut.gene] = count_dict[mut.gene] + 1
    else:
        count_dict[mut.gene] = 1
mutation_counts = pd.Series(pd.Series(count_dict).sort_values(ascending=False))
In [16]:
len(mutation_counts)
Out[16]:
440
In [17]:
np.sum(mutation_counts > 1)
Out[17]:
30
In [18]:
mutation_counts[:31]
Out[18]:
sufB    8
rne     6
rhsD    4
ptsP    4
rpoB    3
yeeJ    3
sufC    3
glgX    3
yhdP    2
eptC    2
mhpF    2
ompT    2
entS    2
rhsC    2
dtpD    2
ftsK    2
psuG    2
etk     2
dnaE    2
barA    2
kgtP    2
ydeT    2
ynfF    2
acrD    2
yfbS    2
menD    2
rcsC    2
parC    2
cobS    2
yjjB    2
lpxB    1
dtype: int64

Calculate the probability that a cluster of mutations would fit within each gene given 1000 mutations across the entire chromosome

We found 825 nonancestral mutations and only looked at nonsynonymous mutations within each gene, but to avoid false positives, we'll calculate how often 1000 mutations randomly distributed throughout the E. coli chromosome would lead to a cluster of mutations as large as the cluster in each gene and in a length less than or equal to that of the gene.

We only need to look at genes that had 2 or more mutations since we're looking at the likelihood of multiple mutations clustered together.

We calculate the probability of a cluster by simulation and will be using the traditional p_value cutoff of .05 (our simulation method means this is already corrected for multiple comparisions across the whole chromosome) so we only take 100,000 draws for our simulations to calculate the probability each cluster would occur by chance. If a 0 is returned that means almost certainly p < 10^-3 and likely less than 10^-4.

In [19]:
TOTAL_MUTATIONS = 1000
CHROMOSOME_LENGTH = 4641652

for gene, n_mutations in list(mutation_counts.items())[:30]:
    length = session.query(sqd.Gene).filter(sqd.Gene.name==gene).first().length
    p = cst.p_extreme_cluster(n_mutations, length, TOTAL_MUTATIONS, CHROMOSOME_LENGTH, max_draws=10**5)
    print(gene, n_mutations, length, p)
sufB 8 1488 8e-05
rne 6 3186 0.43234
rhsD 4 4281 1.0
ptsP 4 2247 1.0
rpoB 3 4029 1.0
yeeJ 3 7077 1.0
sufC 3 747 0.99999
glgX 3 1974 1.0
yhdP 2 3801 1.0
eptC 2 1734 1.0
mhpF 2 951 1.0
ompT 2 954 1.0
entS 2 1251 1.0
rhsC 2 4194 1.0
dtpD 2 1482 1.0
ftsK 2 3990 1.0
psuG 2 939 1.0
etk 2 2181 1.0
dnaE 2 3483 1.0
barA 2 2757 1.0
kgtP 2 1299 1.0
ydeT 2 1149 1.0
ynfF 2 2424 1.0
acrD 2 3114 1.0
yfbS 2 1833 1.0
menD 2 1671 1.0
rcsC 2 2850 1.0
parC 2 2259 1.0
cobS 2 744 1.0
yjjB 2 474 1.0

Inspect the individual mutations in genes that passed the p value cutoff for the chance there would be a random cluster of that size given 1000 mutations across the entire chromosome

The only gene that passes this threshold is the sufB gene.

In [20]:
mutation_ev_by_gene = {}
for index in pd.Series(count_dict).index:
    mutation_ev_by_gene[index] = [ev for ev in (session.query(sqd.SNP_Evidence)
                                                           .join(sqd.SNP_Mutation)
                                                           .filter(sqd.SNP_Mutation.gene==index)
                                                           .order_by(sqd.SNP_Evidence.chr_position,
                                                                     sqd.SNP_Evidence.sample))]
In [21]:
for mut in mutation_ev_by_gene['sufB']:
    print(mut, mut.frequency)
<SNP_Evidence(sample=Hi4t1_S1, chr_position=1761840, ref_base=C, new_base=T)> 0.872701645
<SNP_Evidence(sample=Hi4t2_S1, chr_position=1761840, ref_base=C, new_base=T)> 0.18745327
<SNP_Evidence(sample=Hi3t1_S1, chr_position=1762024, ref_base=G, new_base=A)> 0.0707144737
<SNP_Evidence(sample=Hi1t1_S1, chr_position=1762267, ref_base=A, new_base=G)> 0.407216549
<SNP_Evidence(sample=Hi1t2_S1, chr_position=1762291, ref_base=A, new_base=G)> 0.193992615
<SNP_Evidence(sample=HiMid2t2_S1, chr_position=1762299, ref_base=C, new_base=T)> 0.357156754
<SNP_Evidence(sample=Hi3t1_S1, chr_position=1762315, ref_base=A, new_base=G)> 0.337581158
<SNP_Evidence(sample=HiMid3t1_S1, chr_position=1762497, ref_base=C, new_base=T)> 0.945244312
<SNP_Evidence(sample=HiMid3t2_S1, chr_position=1762497, ref_base=C, new_base=T)> 0.837644577
<SNP_Evidence(sample=HiMid4t1_S1, chr_position=1762620, ref_base=T, new_base=C)> 0.104299068

Repeat the same analysis for deletion mutations

First filter out ancestral mutations. There is no such thing as a synonymous deletion so we skip that step. We discovered the problem with unmerged mutations for deletions before loading the database with the breseq run data, so that step has already been handled.

In [22]:
del_mutations = [mut for mut in session.query(sqd.DEL_Mutation).order_by(sqd.DEL_Mutation.chr_position)]
In [23]:
len(del_mutations)
Out[23]:
39
In [24]:
nonancestral_deletions = [dele for dele in del_mutations
                          if ('Aggregate_NS001_Ancestors' not in dele.samples) and
                             ('Ancestor_S1' not in dele.samples) and
                             ('Ancestor_S2' not in dele.samples) and
                             ('Ancestor_S3' not in dele.samples)]
nonancestral_deletions = [dele for dele in nonancestral_deletions if len(dele.samples) <= 2]
In [25]:
len(nonancestral_deletions)
Out[25]:
35

Now count up the number of deletions per gene

In [26]:
count_dele = {}
for dele in nonancestral_deletions:
    if dele.gene in count_dele:
        count_dele[dele.gene] = count_dele[dele.gene] + 1
    else:
        count_dele[dele.gene] = 1
deletion_counts = pd.Series(pd.Series(count_dele).sort_values(ascending=False))
In [27]:
deletion_counts
Out[27]:
sufS         3
creD         1
cysP         1
mhpR         1
lon          1
mscK         1
cydA         1
pgaB         1
csgE         1
adhE         1
ydcL         1
dtpA/gstA    1
ydhK         1
cfa          1
astB         1
fliA         1
stpA         1
dcuS         1
xdhA         1
yhbE         1
rapZ         1
prkB         1
glgP         1
yhjG         1
bcsB         1
dppA/proK    1
waaQ/waaA    1
dfp          1
dinD/yicG    1
fadA         1
frvR         1
phnG         1
aceE         1
dtype: int64

The only gene with multiple deletions is sufS. Let's check the p-value for having this many deletions

In [28]:
TOTAL_DELETIONS = 40
length_sufS = session.query(sqd.Gene).filter(sqd.Gene.name=='sufS').first().length
p_sufS = cst.p_extreme_cluster(3, length_sufS, TOTAL_DELETIONS, CHROMOSOME_LENGTH, max_draws=10**5)
print('sufS', 3, length_sufS, p_sufS)
sufS 3 1221 0.002

That passes the threshold for significance (and is already corrected for multiple comparisons)

In [29]:
del_ev_by_gene = {}
for index in pd.Series(count_dele).index:
    del_ev_by_gene[index] = [ev for ev in (session.query(sqd.DEL_Evidence)
                                                            .join(sqd.DEL_Mutation)
                                                            .filter(sqd.DEL_Mutation.gene==index)
                                                            .order_by(sqd.DEL_Evidence.chr_position,
                                                                      sqd.DEL_Evidence.sample))]
In [30]:
for dele in del_ev_by_gene['sufS']:
    print(dele, dele.frequency)
<DEL_Evidence(sample=Hi4t1_S1, chr_position=1758593, size=1 0.112882614
<DEL_Evidence(sample=Hi4t2_S1, chr_position=1758593, size=1 0.517487526
<DEL_Evidence(sample=Mid2t1_S1, chr_position=1759347, size=1 0.0850977898
<DEL_Evidence(sample=Hi1t1_S1, chr_position=1759408, size=4 0.370708661

Repeat the analysis for insertion mutations

First filter out ancestral mutations. There is no such thing as a synonymous insertion so we skip that step. We discovered the problem with unmerged mutations for insertion before loading the database with the breseq run data, so that step has already been handled.

In [31]:
ins_mutations = [mut for mut in session.query(sqd.INS_Mutation).order_by(sqd.INS_Mutation.chr_position)]
In [32]:
len(ins_mutations)
Out[32]:
21
In [33]:
nonancestral_insertions = [ins for ins in ins_mutations
                           if ('Aggregate_NS001_Ancestors' not in ins.samples) and
                              ('Ancestor_S1' not in ins.samples) and
                              ('Ancestor_S2' not in ins.samples) and
                              ('Ancestor_S3' not in ins.samples)]
nonancestral_insertions = [ins for ins in nonancestral_insertions if len(ins.samples) <= 2]
In [34]:
len(nonancestral_insertions)
Out[34]:
18

Now count up the number of insertions per gene

In [35]:
count_ins = {}
for ins in nonancestral_insertions:
    if ins.gene in count_ins:
        count_ins[ins.gene] = count_ins[ins.gene] + 1
    else:
        count_ins[ins.gene] = 1
insertion_counts = pd.Series(pd.Series(count_ins).sort_values(ascending=False))
In [36]:
insertion_counts
Out[36]:
ytfT         1
aspA         1
yccE         1
grxB/mdtH    1
ymfE/lit     1
sufS         1
uvrY         1
ada          1
rodZ/rlmN    1
purL/mltF    1
pgeF         1
yfjM/rnlA    1
mtr          1
glpR         1
bcsZ         1
asnA         1
phnG         1
yagG         1
dtype: int64

There are no genes with multiple nonancestral insertions.

In [37]:
ins_ev_by_gene = {}
for index in pd.Series(count_ins).index:
    ins_ev_by_gene[index] = [ev for ev in (session.query(sqd.INS_Evidence)
                                                            .join(sqd.INS_Mutation)
                                                            .filter(sqd.INS_Mutation.gene==index)
                                                            .order_by(sqd.INS_Evidence.chr_position,
                                                                      sqd.INS_Evidence.sample))]
In [38]:
for ins in ins_ev_by_gene['sufS']:
    print(ins, ins.frequency)
<INS_Evidence(sample=HiMid4t2_S1, chr_position=1758899, new_seq=C 1.0
<INS_Evidence(sample=Mid1t1_S1, chr_position=1758899, new_seq=C 1.0
<INS_Evidence(sample=Mid1t2_S1, chr_position=1758899, new_seq=C 1.0
<INS_Evidence(sample=Mid4t1_S1, chr_position=1758992, new_seq=C 1.0
<INS_Evidence(sample=Mid4t2_S1, chr_position=1758992, new_seq=C 1.0

There are only 3 mobile element insertions total so we skip the full analysis of mobile element insertions and instead just show the evidence for these three

In [39]:
for mob in session.query(sqd.MOB_Evidence).order_by(sqd.MOB_Evidence.sample):
    print(mob, mob.frequency)
<MOB_Evidence(sample=Lo2t2_S1, chr_position=252489, repeat_name=IS1 0.716814159
<MOB_Evidence(sample=LoMid2t2_S1, chr_position=2381501, repeat_name=IS1 0.549295775
<MOB_Evidence(sample=Mid3t1_S1, chr_position=1758639, repeat_name=IS2 0.831775701
<MOB_Evidence(sample=Mid3t2_S1, chr_position=1758639, repeat_name=IS2 0.942857143

What if we look at the SNPs in the whole suf operon?

Although only sufB passes the threshold of statistical significance, there are 3 SNPs in sufC as well, and maybe a mutation in sufA, sufD, sufE, or sufS. What happens if we look at the entire operons length vs the number of mutations in it?

In [40]:
sufs = ['suf' + letter for letter in ['A', 'B', 'C', 'D', 'E', 'S']]
suf_operon_length = 0
suf_snp_count = 0

print('For each gene individually, the number of mutations, gene length, and p_value of the cluster is')
for gene in sufs:
    try:
        n_snps = mutation_counts[gene]
    except KeyError:
        n_snps = 0
    suf_snp_count = suf_snp_count + n_snps
    length = session.query(sqd.Gene).filter(sqd.Gene.name==gene).first().length
    suf_operon_length = suf_operon_length + length
    p = cst.p_extreme_cluster(n_snps, length, TOTAL_MUTATIONS, CHROMOSOME_LENGTH, max_draws=10**5)
    print(gene, n_snps, length, p)
For each gene individually, the number of mutations, gene length, and p_value of the cluster is
sufA 0 369 0.10285
sufB 8 1488 3e-05
sufC 3 747 0.99997
sufD 0 1272 0.10228
sufE 1 417 1.0
sufS 1 1221 1.0
In [41]:
print('For the entire suf operon, the number of mutations, operon_length, and p_value of the cluster is')
p_operon = cst.p_extreme_cluster(suf_snp_count, suf_operon_length, TOTAL_MUTATIONS, CHROMOSOME_LENGTH, max_draws=10**6)
print('suf operon', suf_snp_count, suf_operon_length, p_operon)
For the entire suf operon, the number of mutations, operon_length, and p_value of the cluster is
suf operon 13 5514 3e-06
In [ ]: