diff --git a/README.rst b/README.rst index 9cada6a..3038fc9 100644 --- a/README.rst +++ b/README.rst @@ -8,7 +8,8 @@ dN/dS Calculator :target: https://pypi.python.org/pypi/dnds Calculate dN/dS ratio precisely (Ka/Ks) using a codon-by-codon counting -method. +method. Also calculates the pN/pS ratio precisely (previously referred to as +dN/dS). Usage ----- @@ -17,7 +18,9 @@ Usage >>> sequence_1 = "ATGCTTTTGAAATCG" >>> sequence_2 = "ATGCGTTCGAAGTCG" - >>> dnds(sequence_1, sequence2) + >>> pnps(sequence_1, sequence2) Fraction(38, 71) - >>> round(float(dnds(sequence_1, sequence2)), 3) + >>> round(float(pnps(sequence_1, sequence2)), 3) 0.535 + >>> round(dnds(sequence_1, sequence_2), 3) + 0.467 diff --git a/dnds.py b/dnds.py index e982fbc..e7477b1 100644 --- a/dnds.py +++ b/dnds.py @@ -1,6 +1,13 @@ +"""dnds + +This module is a reference implementation of estimating nucleotide substitution +neutrality by estimating the percent of synonymous and nonsynonymous mutations. +""" from __future__ import print_function, division -from codons import codons +from math import log from fractions import Fraction +import logging +from codons import codons BASES = {'A', 'G', 'T', 'C'} @@ -80,16 +87,16 @@ def codon_subs(codon1, codon2): return 0 elif diff == 1: return int(translate(codon1) == translate(codon2)) - else: - syn = 0 - for i in range(len(codon1)): - base1 = codon1[i] - base2 = codon2[i] - if base1 != base2: - new_codon = codon1[:i] + base2 + codon1[i + 1:] - syn += int(is_synonymous(codon1, new_codon)) - syn += int(is_synonymous(codon2, new_codon)) - return syn / diff + + syn = 0 + for i in range(len(codon1)): + base1 = codon1[i] + base2 = codon2[i] + if base1 != base2: + new_codon = codon1[:i] + base2 + codon1[i + 1:] + syn += int(is_synonymous(codon1, new_codon)) + syn += int(is_synonymous(codon2, new_codon)) + return syn / diff def substitutions(seq1, seq2): @@ -105,27 +112,57 @@ def substitutions(seq1, seq2): return (syn, dna_changes - syn) +def clean_sequence(seq): + """Clean up provided sequence by removing whitespace.""" + return seq.replace(' ', '') + + +def pnps(seq1, seq2): + """Main function to calculate pN/pS between two DNA sequences.""" + # Strip any whitespace from both strings + seq1 = clean_sequence(seq1) + seq2 = clean_sequence(seq2) + # Check that both sequences have the same length + assert len(seq1) == len(seq2) + # Check that sequences are codons + assert len(seq1) % 3 == 0 + + syn_sites = syn_sum(seq1, seq2) + non_sites = len(seq1) - syn_sites + logging.info('Sites (syn/nonsyn): {}, {}'.format(syn_sites, non_sites)) + syn_subs, non_subs = substitutions(seq1, seq2) + logging.info('pN: {} / {}\t\tpS: {} / {}' + .format(non_subs, round(non_sites), syn_subs, round(syn_sites))) + pn = non_subs / non_sites + ps = syn_subs / syn_sites + return pn / ps + + def dnds(seq1, seq2): - """Main function to calculate dN/dS between two DNA sequences""" + """Main function to calculate dN/dS between two DNA sequences per Nei & + Gojobori 1986. This includes the per site conversion adapted from Jukes & + Cantor 1967. + """ # Strip any whitespace from both strings - seq1 = seq1.replace(' ', '') - seq2 = seq2.replace(' ', '') + seq1 = clean_sequence(seq1) + seq2 = clean_sequence(seq2) # Check that both sequences have the same length assert len(seq1) == len(seq2) # Check that sequences are codons assert len(seq1) % 3 == 0 - assert len(seq2) % 3 == 0 + syn_sites = syn_sum(seq1, seq2) non_sites = len(seq1) - syn_sites - # print(syn_sites, non_sites) + logging.info('Sites (syn/nonsyn): {}, {}'.format(syn_sites, non_sites)) syn_subs, non_subs = substitutions(seq1, seq2) - # print('dN: {} / {}\t\tdS: {} / {}' - # .format(non_subs, round(non_sites), syn_subs, round(syn_sites))) - dn = non_subs / non_sites - ds = syn_subs / syn_sites + pn = non_subs / non_sites + ps = syn_subs / syn_sites + dn = -(3 / 4) * log(1 - (4 * pn / 3)) + ds = -(3 / 4) * log(1 - (4 * ps / 3)) + logging.info('dN: {}\t\tdS: {}'.format(round(dn, 3), round(ds, 3))) return dn / ds if __name__ == '__main__': - print(dnds('ACC GTG GGA TGC ACC GGT GTG CCC', + print(pnps('ACC GTG GGA TGC ACC GGT GTG CCC', 'ACA GTG AGA TAT AAA GGA GAG AAC')) diff --git a/setup.py b/setup.py index 3432a1a..739455b 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ version=version, description="Calculate dN/dS ratio precisely (Ka/Ks) using a codon-by-codon counting method.", author="Adel Qalieh", - author_email="adelq@sas.upenn.edu", + author_email="adelq@med.umich.edu", url="https://github.com/adelq/dnds", license="MIT", py_modules=['dnds', 'codons'], @@ -22,4 +22,5 @@ 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', ]) diff --git a/test_dnds.py b/test_dnds.py index ede66b2..8d7506c 100644 --- a/test_dnds.py +++ b/test_dnds.py @@ -1,6 +1,7 @@ -from dnds import dnds, substitutions, dnds_codon, dnds_codon_pair, syn_sum, translate +from __future__ import division from fractions import Fraction from nose.tools import assert_equal, assert_almost_equal +from dnds import dnds, pnps, substitutions, dnds_codon, dnds_codon_pair, syn_sum, translate # From Canvas practice problem TEST_SEQ1 = 'ACTCCGAACGGGGCGTTAGAGTTGAAACCCGTTAGA' @@ -8,6 +9,9 @@ # From in-class problem set TEST_SEQ3 = 'ATGCTTTTGAAATCGATCGTTCGTTCACATCGATGGATC' TEST_SEQ4 = 'ATGCGTTCGAAGTCGATCGATCGCTCAGATCGATCGATC' +# From http://bioinformatics.cvr.ac.uk/blog/calculating-dnds-for-ngs-datasets/ +TEST_SEQ5 = 'ATGAAACCCGGGTTTTAA' +TEST_SEQ6 = 'ATGAAACGCGGCTACTAA' def test_translate(): @@ -77,9 +81,14 @@ def test_syn_subs(): assert_equal(substitutions("CCC", "AAC"), (0, 2)) +def test_pnps(): + assert_almost_equal(pnps(TEST_SEQ1, TEST_SEQ2), 0.269, delta=0.1) + assert_almost_equal(pnps(TEST_SEQ3, TEST_SEQ4), 0.86, delta=0.1) + assert_almost_equal(pnps(TEST_SEQ5, TEST_SEQ6), 0.1364 / 0.6001, delta=1e-4) + + def test_dnds(): - assert_almost_equal(dnds(TEST_SEQ1, TEST_SEQ2), 0.269, delta=0.1) - assert_almost_equal(dnds(TEST_SEQ3, TEST_SEQ4), 0.86, delta=0.1) + assert_almost_equal(dnds(TEST_SEQ5, TEST_SEQ6), 0.1247, delta=1e-4) # DNDS.pdf - wrong based on email conversation with Sean #