From 5242d1f409fc13c5e48ee4555c238e83d5093ca6 Mon Sep 17 00:00:00 2001 From: JLSteenwyk Date: Mon, 12 Feb 2024 15:18:45 -0800 Subject: [PATCH] added c3 trimming --- clipkit/args_processing.py | 6 ++++ clipkit/modes.py | 1 + clipkit/msa.py | 11 ++++--- clipkit/parser.py | 4 ++- clipkit/version.py | 2 +- tests/integration/expected/simple.fa_c3 | 10 ++++++ tests/integration/samples/simple.fa.out | 10 ++++++ tests/integration/test_c3.py | 44 +++++++++++++++++++++++++ tests/unit/test_args_parsing.py | 6 ++++ 9 files changed, 88 insertions(+), 6 deletions(-) create mode 100644 tests/integration/expected/simple.fa_c3 create mode 100644 tests/integration/samples/simple.fa.out create mode 100644 tests/integration/test_c3.py diff --git a/clipkit/args_processing.py b/clipkit/args_processing.py index 6226a92..86654c5 100644 --- a/clipkit/args_processing.py +++ b/clipkit/args_processing.py @@ -36,6 +36,12 @@ def process_args(args) -> dict: quiet = args.quiet or False sequence_type = SeqType(args.sequence_type.lower()) if args.sequence_type else None + if codon and mode == TrimmingMode.c3: + logger.warning( + "C3 and codon-based trimming are incompatible.\nCodon-based trimming removes whole codons while C3 removes every third codon position." 
+ ) + sys.exit() + return dict( input_file=input_file, output_file=output_file, diff --git a/clipkit/modes.py b/clipkit/modes.py index e4001a2..3f3b6fa 100644 --- a/clipkit/modes.py +++ b/clipkit/modes.py @@ -16,3 +16,4 @@ class TrimmingMode(Enum): kpic = "kpic" kpic_gappy = "kpic-gappy" kpic_smart_gap = "kpic-smart-gap" + c3 = "c3" diff --git a/clipkit/msa.py b/clipkit/msa.py index 5743ec5..b7bd563 100644 --- a/clipkit/msa.py +++ b/clipkit/msa.py @@ -209,9 +209,12 @@ def determine_site_positions_to_trim(self, mode, gap_threshold, codon=False): (sites_to_trim_gaps_based, sites_to_trim_classification_based) ) ) - - if codon: + elif mode == TrimmingMode.c3: + sites_to_trim = np.arange(3, self._original_length + 1, 3) - 1 + if codon and mode != TrimmingMode.c3: """ + NOTE: ignoring c3 mode otherwise we would ALWAYS trim the entire file by definition. + For each position in sites_to_trim we need the full triplet of codon positions tuple. Example: [2, 9] -> [1, 2, 3, 7, 8, 9] @@ -258,8 +261,8 @@ def determine_codon_triplet_positions(self, alignment_position): We filter to make sure we are not including any positions out of range """ - block = alignment_position // 3 - codon_triplet_index_start = block * 3 + block = alignment_position // self._codon_size + codon_triplet_index_start = block * self._codon_size sites = [ codon_triplet_index_start, codon_triplet_index_start + 1, diff --git a/clipkit/parser.py b/clipkit/parser.py index 23ddacd..f9f86a4 100644 --- a/clipkit/parser.py +++ b/clipkit/parser.py @@ -73,7 +73,8 @@ def create_parser() -> ArgumentParser: kpic-gappy, kpi, kpi-smart-gap, - kpi-gappy> + kpi-gappy, + c3> -g, --gaps specifies gaps threshold (default: 0.9) @@ -116,6 +117,7 @@ def create_parser() -> ArgumentParser: kpi: keep only parsimony informative sites kpi-smart-gap: a combination of kpi- and smart-gap-based trimming kpi-gappy: a combination of kpi- and gappy-based trimming + c3: remove every third codon position Gaps Positions with gappyness greater 
than threshold will be trimmed. diff --git a/clipkit/version.py b/clipkit/version.py index e835b9d..8a124bf 100644 --- a/clipkit/version.py +++ b/clipkit/version.py @@ -1 +1 @@ -__version__ = "2.1.3" +__version__ = "2.2.0" diff --git a/tests/integration/expected/simple.fa_c3 b/tests/integration/expected/simple.fa_c3 new file mode 100644 index 0000000..22fee15 --- /dev/null +++ b/tests/integration/expected/simple.fa_c3 @@ -0,0 +1,10 @@ +>1 +A-TA +>2 +A--A +>3 +A--T +>4 +AG-T +>5 +AC-T diff --git a/tests/integration/samples/simple.fa.out b/tests/integration/samples/simple.fa.out new file mode 100644 index 0000000..22fee15 --- /dev/null +++ b/tests/integration/samples/simple.fa.out @@ -0,0 +1,10 @@ +>1 +A-TA +>2 +A--A +>3 +A--T +>4 +AG-T +>5 +AC-T diff --git a/tests/integration/test_c3.py b/tests/integration/test_c3.py new file mode 100644 index 0000000..f781862 --- /dev/null +++ b/tests/integration/test_c3.py @@ -0,0 +1,44 @@ +import pytest +from pathlib import Path + +from clipkit.clipkit import execute +from clipkit.files import FileFormat +from clipkit.modes import TrimmingMode +from clipkit.settings import DEFAULT_AA_GAP_CHARS, DEFAULT_NT_GAP_CHARS + +here = Path(__file__) + + +@pytest.mark.integration +class TestC3Out(object): + def test_simple_c3(self): + """ + test c3 trimming mode + usage: clipkit simple.fa -m c3 + """ + output_file = "output/simple.fa_c3" + + kwargs = dict( + input_file=f"{here.parent}/samples/simple.fa", + output_file=output_file, + input_file_format="fasta", + output_file_format="fasta", + sequence_type=None, + complement=False, + codon=False, + gaps=None, + mode=TrimmingMode.c3, + use_log=False, + gap_characters=DEFAULT_NT_GAP_CHARS, + quiet=True, + ) + + execute(**kwargs) + + with open(f"{here.parent}/expected/simple.fa_c3", "r") as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content diff --git a/tests/unit/test_args_parsing.py 
b/tests/unit/test_args_parsing.py index 7299ee4..af0030c 100644 --- a/tests/unit/test_args_parsing.py +++ b/tests/unit/test_args_parsing.py @@ -110,3 +110,9 @@ def test_process_args_expected_keywords(self, args): "quiet", ] assert sorted(res.keys()) == sorted(expected_keys) + + def test_incompatible_codon_args(self, args): + args.codon = True + args.mode = TrimmingMode.c3 + with pytest.raises(SystemExit): + process_args(args)