-
Notifications
You must be signed in to change notification settings - Fork 9
/
dnds.py
168 lines (133 loc) · 5.21 KB
/
dnds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
"""dnds
This module is a reference implementation of estimating nucleotide substitution
neutrality by estimating the percent of synonymous and nonsynonymous mutations.
"""
from __future__ import print_function, division
from math import log
from fractions import Fraction
import logging
from codons import codons
BASES = {'A', 'G', 'T', 'C'}
def split_seq(seq, n=3):
'''Returns sequence split into chunks of n characters, default is codons'''
return [seq[i:i + n] for i in range(0, len(seq), n)]
def average_list(l1, l2):
"""Return the average of two lists"""
return [(i1 + i2) / 2 for i1, i2 in zip(l1, l2)]
def dna_to_protein(codon):
'''Returns single letter amino acid code for given codon'''
return codons[codon]
def translate(seq):
"""Translate a DNA sequence into the 1-letter amino acid sequence"""
return "".join([dna_to_protein(codon) for codon in split_seq(seq)])
def is_synonymous(codon1, codon2):
'''Returns boolean whether given codons are synonymous'''
return dna_to_protein(codon1) == dna_to_protein(codon2)
def dnds_codon(codon):
'''Returns list of synonymous counts for a single codon.
Calculations done per the methodology taught in class.
http://sites.biology.duke.edu/rausher/DNDS.pdf
'''
syn_list = []
for i in range(len(codon)):
base = codon[i]
other_bases = BASES - {base}
syn = 0
for new_base in other_bases:
new_codon = codon[:i] + new_base + codon[i + 1:]
syn += int(is_synonymous(codon, new_codon))
syn_list.append(Fraction(syn, 3))
return syn_list
def dnds_codon_pair(codon1, codon2):
"""Get the dN/dS for the given codon pair"""
return average_list(dnds_codon(codon1), dnds_codon(codon2))
def syn_sum(seq1, seq2):
"""Get the sum of synonymous sites from two DNA sequences"""
syn = 0
codon_list1 = split_seq(seq1)
codon_list2 = split_seq(seq2)
for i in range(len(codon_list1)):
codon1 = codon_list1[i]
codon2 = codon_list2[i]
dnds_list = dnds_codon_pair(codon1, codon2)
syn += sum(dnds_list)
return syn
def hamming(s1, s2):
"""Return the hamming distance between 2 DNA sequences"""
return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2)) + abs(len(s1) - len(s2))
def codon_subs(codon1, codon2):
"""Returns number of synonymous substitutions in provided codon pair
Methodology for multiple substitutions from Dr. Swanson, UWashington
https://faculty.washington.edu/wjs18/dnds.ppt
"""
diff = hamming(codon1, codon2)
if diff < 1:
return 0
elif diff == 1:
return int(translate(codon1) == translate(codon2))
syn = 0
for i in range(len(codon1)):
base1 = codon1[i]
base2 = codon2[i]
if base1 != base2:
new_codon = codon1[:i] + base2 + codon1[i + 1:]
syn += int(is_synonymous(codon1, new_codon))
syn += int(is_synonymous(codon2, new_codon))
return syn / diff
def substitutions(seq1, seq2):
"""Returns number of synonymous and nonsynonymous substitutions"""
dna_changes = hamming(seq1, seq2)
codon_list1 = split_seq(seq1)
codon_list2 = split_seq(seq2)
syn = 0
for i in range(len(codon_list1)):
codon1 = codon_list1[i]
codon2 = codon_list2[i]
syn += codon_subs(codon1, codon2)
return (syn, dna_changes - syn)
def clean_sequence(seq):
"""Clean up provided sequence by removing whitespace."""
return seq.replace(' ', '')
def pnps(seq1, seq2):
"""Main function to calculate pN/pS between two DNA sequences."""
# Strip any whitespace from both strings
seq1 = clean_sequence(seq1)
seq2 = clean_sequence(seq2)
# Check that both sequences have the same length
assert len(seq1) == len(seq2)
# Check that sequences are codons
assert len(seq1) % 3 == 0
syn_sites = syn_sum(seq1, seq2)
non_sites = len(seq1) - syn_sites
logging.info('Sites (syn/nonsyn): {}, {}'.format(syn_sites, non_sites))
syn_subs, non_subs = substitutions(seq1, seq2)
logging.info('pN: {} / {}\t\tpS: {} / {}'
.format(non_subs, round(non_sites), syn_subs, round(syn_sites)))
pn = non_subs / non_sites
ps = syn_subs / syn_sites
return pn / ps
def dnds(seq1, seq2):
"""Main function to calculate dN/dS between two DNA sequences per Nei &
Gojobori 1986. This includes the per site conversion adapted from Jukes &
Cantor 1967.
"""
# Strip any whitespace from both strings
seq1 = clean_sequence(seq1)
seq2 = clean_sequence(seq2)
# Check that both sequences have the same length
assert len(seq1) == len(seq2)
# Check that sequences are codons
assert len(seq1) % 3 == 0
syn_sites = syn_sum(seq1, seq2)
non_sites = len(seq1) - syn_sites
logging.info('Sites (syn/nonsyn): {}, {}'.format(syn_sites, non_sites))
syn_subs, non_subs = substitutions(seq1, seq2)
pn = non_subs / non_sites
ps = syn_subs / syn_sites
dn = -(3 / 4) * log(1 - (4 * pn / 3))
ds = -(3 / 4) * log(1 - (4 * ps / 3))
logging.info('dN: {}\t\tdS: {}'.format(round(dn, 3), round(ds, 3)))
return dn / ds
if __name__ == '__main__':
print(pnps('ACC GTG GGA TGC ACC GGT GTG CCC',
'ACA GTG AGA TAT AAA GGA GAG AAC'))