From 5a18033015c2a5df84a5b40a1d28bb5d96ff09d3 Mon Sep 17 00:00:00 2001 From: Thomas Buida Date: Mon, 5 Feb 2024 19:39:00 -0600 Subject: [PATCH] Finish codon-based trimming --- clipkit/api.py | 2 + clipkit/clipkit.py | 8 +++- clipkit/msa.py | 47 ++++++++++++++++----- clipkit/write.py | 8 ++-- tests/integration/test_codon.py | 32 ++++++++++++++- tests/unit/test_msa.py | 72 +++++++++++++++++++++++++++++++++ 6 files changed, 153 insertions(+), 16 deletions(-) diff --git a/clipkit/api.py b/clipkit/api.py index 4db0061..d42506e 100644 --- a/clipkit/api.py +++ b/clipkit/api.py @@ -39,6 +39,7 @@ def clipkit( # override some options not currently available through programmatic interface complement = False + codon = False use_log = False quiet = True @@ -51,6 +52,7 @@ def clipkit( gaps, gap_characters, complement, + codon, TrimmingMode(mode), use_log, quiet, diff --git a/clipkit/clipkit.py b/clipkit/clipkit.py index bdb66c9..a97360d 100644 --- a/clipkit/clipkit.py +++ b/clipkit/clipkit.py @@ -47,6 +47,7 @@ class TrimRun: input_file_format: FileFormat output_file_format: FileFormat gaps: float + codon: bool version: str = current_version @property @@ -67,6 +68,7 @@ def run( gaps: float, gap_characters: Union[list, None], complement: bool, + codon: bool, mode: TrimmingMode, use_log: bool, quiet: bool, @@ -99,7 +101,7 @@ def run( gaps = smart_gap_threshold_determination(alignment, gap_characters) msa = create_msa(alignment, gap_characters) - msa.trim(mode, gap_threshold=gaps) + msa.trim(mode, gap_threshold=gaps, site_positions_to_trim=None, codon=codon) trim_run = TrimRun( alignment, @@ -109,6 +111,7 @@ def run( input_file_format, output_file_format, gaps, + codon, ) return trim_run, msa.stats @@ -123,6 +126,7 @@ def execute( gaps: float, gap_characters: Union[list, None], complement: bool, + codon: bool, mode: TrimmingMode, use_log: bool, quiet: bool, @@ -151,6 +155,7 @@ def execute( gaps, gap_characters, complement, + codon, mode, use_log, quiet, @@ -168,6 +173,7 @@ def execute( trim_run.gap_characters, mode, complement, + codon, use_log, ) diff --git a/clipkit/msa.py b/clipkit/msa.py index 0be6d79..5743ec5 100644 --- a/clipkit/msa.py +++ b/clipkit/msa.py @@ -2,6 +2,7 @@ from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord import numpy as np +from itertools import chain from typing import Union from .modes import TrimmingMode @@ -113,7 +114,12 @@ def trim( site_positions_to_trim = np.array(site_positions_to_trim) if not isinstance(site_positions_to_trim, np.ndarray): raise ValueError("site_positions_to_trim must be a list or np array") - self._site_positions_to_trim = site_positions_to_trim + + self._site_positions_to_trim = ( + self.determine_all_codon_sites_to_trim(site_positions_to_trim) + if codon is True + else site_positions_to_trim + ) else: self._site_positions_to_trim = self.determine_site_positions_to_trim( mode, @@ -210,10 +216,8 @@ def determine_site_positions_to_trim(self, mode, gap_threshold, codon=False): Example: [2, 9] -> [1, 2, 3, 7, 8, 9] """ - sites_to_trim = map( - self.determine_codon_triplet_positions, sites_to_trim - ).flatten() - print(sites_to_trim) + return self.determine_all_codon_sites_to_trim(sites_to_trim) + return sites_to_trim def generate_debug_log_info(self): @@ -234,10 +238,31 @@ def generate_debug_log_info(self): gappyness, ) - def determine_codon_triplet_positions(alignment_position): + def determine_all_codon_sites_to_trim(self, sites_to_trim): + """ + For each position in sites_to_trim we need the full triplet of codon positions. + + Sites to trim -> all codon sites to trim + [2, 8] -> [0, 1, 2, 6, 7, 8] + """ + sites_to_trim_codon = [ + self.determine_codon_triplet_positions(site_pos) + for site_pos in sites_to_trim + ] + flattened_unique_sites = list(set(chain(*sites_to_trim_codon))) + return np.array(flattened_unique_sites) + + def determine_codon_triplet_positions(self, alignment_position): + """ + Block 0 -> [0,1,2], block 1 -> [3,4,5] + + We filter to make sure we are not including any positions out of range + """ block = alignment_position // 3 - remainder = alignment_position % 3 - if remainder: - block += 1 - codon_triplet_index = block * 3 - return [codon_triplet_index - 2, codon_triplet_index - 1, codon_triplet_index] + codon_triplet_index_start = block * 3 + sites = [ + codon_triplet_index_start, + codon_triplet_index_start + 1, + codon_triplet_index_start + 2, + ] + return list(filter(lambda x: x <= self._original_length - 1, sites)) diff --git a/clipkit/write.py b/clipkit/write.py index 0173311..5c37f8b 100644 --- a/clipkit/write.py +++ b/clipkit/write.py @@ -34,6 +34,7 @@ def write_user_args( gap_chars: list, mode: "TrimmingMode", complement: bool, + codon: bool, use_log: bool, ) -> None: if seq_type.value == "nt": @@ -57,6 +58,7 @@ def write_user_args( Gap characters: {gap_chars} Trimming mode: {mode.value} Create complementary output: {complement} + Process as codons: {codon} Create log file: {use_log} """ # noqa ) @@ -77,9 +79,9 @@ def write_output_files_message( ------------------------ | Writing output files | ------------------------ - trimmed alignment: {out_file_name} - complement file: {out_file_name + '.complement' if complement else False} - log file: {out_file_name + '.log' if use_log else False} + Trimmed alignment: {out_file_name} + Complement file: {out_file_name + '.complement' if complement else False} + Log file: {out_file_name + '.log' if use_log else False} """ ) ) diff --git a/tests/integration/test_codon.py b/tests/integration/test_codon.py index 4bffa3e..19e62d4 100644 --- a/tests/integration/test_codon.py +++ b/tests/integration/test_codon.py @@ -26,7 +26,7 @@ def test_simple_codon(self): sequence_type=None, complement=False, codon=True, - gaps=0.2, + gaps=0.8, mode=TrimmingMode.gappy, use_log=False, gap_characters=DEFAULT_NT_GAP_CHARS, @@ -42,3 +42,33 @@ def test_simple_codon(self): output_content = out_file.read() assert expected_content == output_content + + def test_simple_codon_all_trimmed(self): + """ + test codon + usage: clipkit simple.fa -co + """ + output_file = "output/simple.fa_gappy_codon" + kwargs = dict( + input_file=f"{here.parent}/samples/simple.fa", + output_file=output_file, + input_file_format="fasta", + output_file_format="fasta", + sequence_type=None, + complement=False, + codon=True, + gaps=0.1, + mode=TrimmingMode.gappy, + use_log=False, + gap_characters=DEFAULT_NT_GAP_CHARS, + quiet=True, + ) + + execute(**kwargs) + + expected_content = ">1\n>2\n>3\n>4\n>5\n" + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content diff --git a/tests/unit/test_msa.py b/tests/unit/test_msa.py index c340322..485b154 100644 --- a/tests/unit/test_msa.py +++ b/tests/unit/test_msa.py @@ -62,3 +62,75 @@ def test_trim_by_provided_site_positions_list(self): ] ) np.testing.assert_equal(msa.sites_kept, expected_sites_kept) + + @pytest.mark.parametrize( + "sites_to_trim, expected", + [ + ( + [0], + np.array( + [ + ["T", "A", "T"], + ["-", "A", "T"], + ["-", "T", "A"], + ["-", "T", "A"], + ["-", "T", "-"], + ] + ), + ), + ( + [2], + np.array( + [ + ["T", "A", "T"], + ["-", "A", "T"], + ["-", "T", "A"], + ["-", "T", "A"], + ["-", "T", "-"], + ] + ), + ), + ( + [3], + np.array( + [ + ["A", "-", "G"], + ["A", "-", "G"], + ["A", "-", "G"], + ["A", "G", "A"], + ["A", "C", "a"], + ] + ), + ), + ( + [5], + np.array( + [ + ["A", "-", "G"], + ["A", "-", "G"], + ["A", "-", "G"], + ["A", "G", "A"], + ["A", "C", "a"], + ] + ), + ), + ( + [0, 1, 2, 3, 4, 5], + np.array( + [ + [], + [], + [], + [], + [], + ], + dtype=object, + ), + ), + ], + ) + def test_trim_codons(self, sites_to_trim, expected): + bio_msa = get_biopython_msa("tests/unit/examples/simple.fa") + msa = MSA.from_bio_msa(bio_msa) + msa.trim(site_positions_to_trim=sites_to_trim, codon=True) + np.testing.assert_equal(msa.trimmed, expected)