Skip to content

Commit

Permalink
initial commit of codon-based trimming
Browse files Browse the repository at this point in the history
  • Loading branch information
JLSteenwyk committed Feb 5, 2024
1 parent de1855c commit b2bb0da
Show file tree
Hide file tree
Showing 17 changed files with 189 additions and 2 deletions.
2 changes: 2 additions & 0 deletions clipkit/args_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def process_args(args) -> dict:

# assign optional arguments
complement = args.complementary or False
codon = args.codon or False
mode = TrimmingMode(args.mode) if args.mode else TrimmingMode.smart_gap
gaps = float(args.gaps) if args.gaps is not None else 0.9
gap_characters = (
Expand All @@ -40,6 +41,7 @@ def process_args(args) -> dict:
output_file=output_file,
input_file_format=args.input_file_format,
output_file_format=args.output_file_format,
codon=codon,
sequence_type=sequence_type,
complement=complement,
gaps=gaps,
Expand Down
26 changes: 24 additions & 2 deletions clipkit/msa.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def __init__(
self._site_classification_types = None
self._column_character_frequencies = None
self._gap_chars = gap_chars
self._codon_size = 3

@staticmethod
def from_bio_msa(alignment: MultipleSeqAlignment, gap_chars=None) -> "MSA":
Expand Down Expand Up @@ -105,6 +106,7 @@ def trim(
mode: TrimmingMode = TrimmingMode.smart_gap,
gap_threshold=None,
site_positions_to_trim=None,
codon=False,
) -> np.array:
if site_positions_to_trim is not None:
if isinstance(site_positions_to_trim, list):
Expand All @@ -114,7 +116,9 @@ def trim(
self._site_positions_to_trim = site_positions_to_trim
else:
self._site_positions_to_trim = self.determine_site_positions_to_trim(
mode, gap_threshold
mode,
gap_threshold,
codon,
)
self._site_positions_to_keep = np.delete(
np.arange(self._original_length), self._site_positions_to_trim
Expand Down Expand Up @@ -154,7 +158,7 @@ def site_classification_types(self):
self._site_classification_types = site_classification_types
return self._site_classification_types

def determine_site_positions_to_trim(self, mode, gap_threshold):
def determine_site_positions_to_trim(self, mode, gap_threshold, codon=False):
if mode in (TrimmingMode.gappy, TrimmingMode.smart_gap):
sites_to_trim = np.where(self.site_gappyness >= gap_threshold)[0]
elif mode == TrimmingMode.kpi:
Expand Down Expand Up @@ -200,6 +204,16 @@ def determine_site_positions_to_trim(self, mode, gap_threshold):
)
)

if codon:
"""
For each position in sites_to_trim we need the full triplet of codon positions tuple.
Example:
[2, 9] -> [1, 2, 3, 7, 8, 9]
"""
sites_to_trim = map(
self.determine_codon_triplet_positions, sites_to_trim
).flatten()
print(sites_to_trim)
return sites_to_trim

def generate_debug_log_info(self):
Expand All @@ -219,3 +233,11 @@ def generate_debug_log_info(self):
self.site_classification_types[idx],
gappyness,
)

def determine_codon_triplet_positions(alignment_position):
    """Return the three 0-based column indices of the codon containing a site.

    The alignment is read in frame starting at column 0, so codon ``k``
    occupies columns ``3k``, ``3k + 1`` and ``3k + 2``.

    Positions are treated as 0-based because the caller obtains them from
    ``np.where`` and the result is ultimately handed to ``np.delete``; both
    use 0-based indexing. The previous 1-based arithmetic returned
    ``[-2, -1, 0]`` for position 0 (negative indices address columns from the
    end of the alignment) and shifted every other triplet one column right.

    Examples:
        0 -> [0, 1, 2]
        2 -> [0, 1, 2]
        9 -> [9, 10, 11]

    NOTE(review): the visible call site invokes this as
    ``self.determine_codon_triplet_positions``; if this definition lives in
    the class body it also needs ``@staticmethod`` (or a ``self`` parameter)
    to be callable that way -- confirm against the full file.

    :param alignment_position: 0-based index of a site flagged for trimming.
    :return: list with the 0-based indices of the full codon, in ascending order.
    """
    # Floor to the first column of the codon that contains this position.
    codon_start = (alignment_position // 3) * 3
    return [codon_start, codon_start + 1, codon_start + 2]
14 changes: 14 additions & 0 deletions clipkit/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ def create_parser() -> ArgumentParser:
-c, --complementary creates complementary alignment of trimmed sequences
(input file named with '.log' suffix)
-co, --codon conduct trimming of codons
-q, --quiet disables all logging to stdout
-h, --help help message
Expand Down Expand Up @@ -151,6 +153,10 @@ def create_parser() -> ArgumentParser:
Complementary
Creates an alignment file of only the trimmed sequences
Codon
Trims codon-based alignments. If one position in a codon should be trimmed, the whole
codon will be trimmed.
""" # noqa
),
)
Expand Down Expand Up @@ -255,4 +261,12 @@ def create_parser() -> ArgumentParser:
help=SUPPRESS,
)

optional.add_argument(
"-co",
"--codon",
action="store_true",
required=False,
help=SUPPRESS,
)

return parser
10 changes: 10 additions & 0 deletions tests/integration/expected/simple.fa_gappy_codon
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1
A-G
>2
A-G
>3
A-G
>4
AGA
>5
ACa
44 changes: 44 additions & 0 deletions tests/integration/test_codon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pytest
from pathlib import Path

from clipkit.clipkit import execute
from clipkit.files import FileFormat
from clipkit.modes import TrimmingMode
from clipkit.settings import DEFAULT_AA_GAP_CHARS, DEFAULT_NT_GAP_CHARS

here = Path(__file__)


@pytest.mark.integration
class TestCodonOut(object):
    def test_simple_codon(self):
        """
        Codon-aware trimming of the simple.fa sample.

        Calls execute() in gappy mode with gaps=0.2 and codon=True, then
        checks that the file it wrote matches the stored expectation in
        expected/simple.fa_gappy_codon character for character.
        """
        tests_dir = here.parent
        produced_path = "output/simple.fa_gappy_codon"

        execute(
            input_file=f"{tests_dir}/samples/simple.fa",
            output_file=produced_path,
            input_file_format="fasta",
            output_file_format="fasta",
            sequence_type=None,
            complement=False,
            codon=True,
            gaps=0.2,
            mode=TrimmingMode.gappy,
            use_log=False,
            gap_characters=DEFAULT_NT_GAP_CHARS,
            quiet=True,
        )

        # Read the reference first, then what the run produced.
        with open(f"{tests_dir}/expected/simple.fa_gappy_codon", "r") as handle:
            wanted = handle.read()

        with open(produced_path, "r") as handle:
            produced = handle.read()

        assert wanted == produced
4 changes: 4 additions & 0 deletions tests/integration/test_complementary_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def test_simple_complement(self):
output_file_format="fasta",
sequence_type=None,
complement=True,
codon=False,
gaps=0.9,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -60,6 +61,7 @@ def test_simple_long_description_complement(self):
output_file_format="fasta",
sequence_type=None,
complement=True,
codon=False,
gaps=0.9,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -94,6 +96,7 @@ def test_12_YIL115C_Anc_2_253_aa_aln_complement(self):
output_file_format="fasta",
sequence_type=None,
complement=True,
codon=False,
gaps=0.9,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -128,6 +131,7 @@ def test_EOG091N44M8_aa_complement(self):
output_file_format="fasta",
sequence_type=None,
complement=True,
codon=False,
gaps=0.9,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down
12 changes: 12 additions & 0 deletions tests/integration/test_gappy_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def test_simple_no_change(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -57,6 +58,7 @@ def test_simple_no_change_long_description(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -88,6 +90,7 @@ def test_12_YIL115C_Anc_2_253_codon_aln(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -121,6 +124,7 @@ def test_12_YIL115C_Anc_2_253_aa_aln(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -154,6 +158,7 @@ def test_24_ENSG00000163519_aa_aln(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -187,6 +192,7 @@ def test_24_ENSG00000163519_codon_aln(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -220,6 +226,7 @@ def test_EOG091N44M8_aa(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -251,6 +258,7 @@ def test_EOG091N44M8_nt(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -316,6 +324,7 @@ def test_simple(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.2,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -349,6 +358,7 @@ def test_12_YIL115C_Anc_2_253_codon_aln(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.3,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -383,6 +393,7 @@ def test_24_ENSG00000163519_codon_aln(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.4,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down Expand Up @@ -417,6 +428,7 @@ def test_EOG091N44M8_nt(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.1,
mode=TrimmingMode.gappy,
use_log=False,
Expand Down
12 changes: 12 additions & 0 deletions tests/integration/test_kpi_gappy_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def test_simple(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.kpi_gappy,
use_log=False,
Expand Down Expand Up @@ -55,6 +56,7 @@ def test_simple_long_description(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.kpi_gappy,
use_log=False,
Expand Down Expand Up @@ -88,6 +90,7 @@ def test_12_YIL115C_Anc_2_253_codon_aln(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.kpi_gappy,
use_log=False,
Expand Down Expand Up @@ -122,6 +125,7 @@ def test_12_YIL115C_Anc_2_253_aa_aln(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.kpi_gappy,
use_log=False,
Expand Down Expand Up @@ -155,6 +159,7 @@ def test_24_ENSG00000163519_aa_aln(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.kpi_gappy,
use_log=False,
Expand Down Expand Up @@ -188,6 +193,7 @@ def test_24_ENSG00000163519_codon_aln(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.kpi_gappy,
use_log=False,
Expand Down Expand Up @@ -221,6 +227,7 @@ def test_EOG091N44M8_aa(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.kpi_gappy,
use_log=False,
Expand Down Expand Up @@ -254,6 +261,7 @@ def test_EOG091N44M8_nt(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.9,
mode=TrimmingMode.kpi_gappy,
use_log=False,
Expand Down Expand Up @@ -321,6 +329,7 @@ def test_simple(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.2,
mode=TrimmingMode.kpi_gappy,
use_log=False,
Expand Down Expand Up @@ -354,6 +363,7 @@ def test_12_YIL115C_Anc_2_253_codon_aln(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.3,
mode=TrimmingMode.kpi_gappy,
use_log=False,
Expand Down Expand Up @@ -388,6 +398,7 @@ def test_24_ENSG00000163519_codon_aln_custom_gap(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.4,
mode=TrimmingMode.kpi_gappy,
use_log=False,
Expand Down Expand Up @@ -422,6 +433,7 @@ def test_EOG091N44M8_nt(self):
output_file_format="fasta",
sequence_type=None,
complement=False,
codon=False,
gaps=0.1,
mode=TrimmingMode.kpi_gappy,
use_log=False,
Expand Down
Loading

0 comments on commit b2bb0da

Please sign in to comment.