From be991d775c38acd10b7dab21374bf62fe57d4e2e Mon Sep 17 00:00:00 2001 From: "Jacob L. Steenwyk" Date: Tue, 7 Dec 2021 10:21:20 -0600 Subject: [PATCH] version bump. entire sequence description, rather than just the identifier, is kept in the output files of ClipKIT --- change_log.txt | 9 +- clipkit/helpers.py | 7 +- clipkit/modes.py | 4 +- clipkit/version.py | 2 +- docs/change_log/index.rst | 6 + .../expected/simple_long_description.clustal | 10 ++ .../simple_long_description.fa.clipkit | 10 ++ .../simple_long_description.fa.clipkit.log | 6 + .../expected/simple_long_description.fa_gappy | 10 ++ ...imple_long_description.fa_gappy.complement | 5 + .../expected/simple_long_description.fa_kpi | 10 ++ .../simple_long_description.fa_kpi_gappy | 10 ++ .../simple_long_description.fa_kpi_smart_gaps | 10 ++ .../expected/simple_long_description.fa_kpic | 10 ++ .../simple_long_description.fa_kpic_gappy | 10 ++ ...simple_long_description.fa_kpic_smart_gaps | 10 ++ .../simple_long_description.fa_smart_gaps | 10 ++ .../expected/simple_long_description.maf | 10 ++ .../expected/simple_long_description.mauve | 17 ++ .../expected/simple_long_description.phylip | 6 + .../simple_long_description.phylip-relaxed | 0 .../simple_long_description.phylip-sequential | 6 + .../simple_long_description.stockholm | 13 ++ .../samples/simple_long_description.fa | 10 ++ .../simple_long_description.fa.clipkit | 10 ++ .../integration/test_complementary_output.py | 31 ++++ tests/integration/test_gappy_mode.py | 79 +++----- tests/integration/test_kpi_gappy_mode.py | 58 +++--- tests/integration/test_kpi_mode.py | 57 +++--- tests/integration/test_kpi_smart_gap_mode.py | 60 +++---- tests/integration/test_kpic_gappy_mode.py | 57 +++--- tests/integration/test_kpic_mode.py | 57 +++--- tests/integration/test_kpic_smart_gap_mode.py | 60 +++---- tests/integration/test_smart_gap_mode.py | 60 +++---- ...riting_to_different_output_file_formats.py | 168 ++++++++++++++++++ 
tests/integration/test_writing_to_log_file.py | 28 +++ tests/unit/test_modes.py | 20 +-- 37 files changed, 660 insertions(+), 286 deletions(-) create mode 100644 tests/integration/expected/simple_long_description.clustal create mode 100644 tests/integration/expected/simple_long_description.fa.clipkit create mode 100644 tests/integration/expected/simple_long_description.fa.clipkit.log create mode 100644 tests/integration/expected/simple_long_description.fa_gappy create mode 100644 tests/integration/expected/simple_long_description.fa_gappy.complement create mode 100644 tests/integration/expected/simple_long_description.fa_kpi create mode 100644 tests/integration/expected/simple_long_description.fa_kpi_gappy create mode 100644 tests/integration/expected/simple_long_description.fa_kpi_smart_gaps create mode 100644 tests/integration/expected/simple_long_description.fa_kpic create mode 100644 tests/integration/expected/simple_long_description.fa_kpic_gappy create mode 100644 tests/integration/expected/simple_long_description.fa_kpic_smart_gaps create mode 100644 tests/integration/expected/simple_long_description.fa_smart_gaps create mode 100644 tests/integration/expected/simple_long_description.maf create mode 100644 tests/integration/expected/simple_long_description.mauve create mode 100644 tests/integration/expected/simple_long_description.phylip create mode 100644 tests/integration/expected/simple_long_description.phylip-relaxed create mode 100644 tests/integration/expected/simple_long_description.phylip-sequential create mode 100644 tests/integration/expected/simple_long_description.stockholm create mode 100644 tests/integration/samples/simple_long_description.fa create mode 100644 tests/integration/samples/simple_long_description.fa.clipkit diff --git a/change_log.txt b/change_log.txt index 71b5b33..174dea3 100644 --- a/change_log.txt +++ b/change_log.txt @@ -1,8 +1,11 @@ Major changes to ClipKIT are summarized here. 
+1.3.0 +long description of sequences, rather than identifiers, are kept in the ClipKIT output + +1.1.5 +carried over code base to biopython, v1.79 + 1.1.0 smart-gap trimming is introduced and is now the default trimming approach used in ClipKIT. smart-gap trimming is a dynamic approach to determine the appropriate gaps threshold for an alignment. - -1.1.5 -carried over code base to biopython, v1.79 \ No newline at end of file diff --git a/clipkit/helpers.py b/clipkit/helpers.py index 160f790..f9a03df 100644 --- a/clipkit/helpers.py +++ b/clipkit/helpers.py @@ -106,12 +106,11 @@ def populate_empty_keepD_and_trimD(alignment): biopython multiple sequence alignment object """ keepD = {} - alignment_length = alignment.get_alignment_length() - for entry in alignment: - keepD[entry.id] = np.zeros([alignment_length], dtype=bytes) trimD = {} + alignment_length = alignment.get_alignment_length() for entry in alignment: - trimD[entry.id] = np.zeros([alignment_length], dtype=bytes) + keepD[entry.description] = np.zeros([alignment_length], dtype=bytes) + trimD[entry.description] = np.zeros([alignment_length], dtype=bytes) return keepD, trimD diff --git a/clipkit/modes.py b/clipkit/modes.py index eef6015..c7524ea 100644 --- a/clipkit/modes.py +++ b/clipkit/modes.py @@ -80,7 +80,7 @@ def trim( # save to keepD if shouldKeep(mode, parsimony_informative, constant_site, gappyness, gaps): for entry in alignment: - keepD[entry.id][alignment_position] = entry.seq._data[alignment_position:alignment_position+1] + keepD[entry.description][alignment_position] = entry.seq._data[alignment_position:alignment_position+1] if use_log: if constant_site: logger.debug(f"{str(alignment_position + 1)} keep Const {gappyness}") @@ -93,7 +93,7 @@ def trim( # save to trimD else: for entry in alignment: - trimD[entry.id][alignment_position] = entry.seq._data[alignment_position:alignment_position+1] + trimD[entry.description][alignment_position] = entry.seq._data[alignment_position:alignment_position+1] if 
use_log: if constant_site: logger.debug(f"{str(alignment_position + 1)} trim Const {gappyness}") diff --git a/clipkit/version.py b/clipkit/version.py index e916218..f94ba41 100644 --- a/clipkit/version.py +++ b/clipkit/version.py @@ -1 +1 @@ -__version__ = '1.2.0' \ No newline at end of file +__version__ = '1.3.0' \ No newline at end of file diff --git a/docs/change_log/index.rst b/docs/change_log/index.rst index 4e53104..c30f170 100644 --- a/docs/change_log/index.rst +++ b/docs/change_log/index.rst @@ -8,6 +8,12 @@ Change log Major changes to ClipKIT are summarized here. +**1.3.0** +long description of sequences, rather than identifiers, are kept in the ClipKIT output + +**1.1.5** +carried over code base to biopython, v1.79 + **1.1.0:** smart-gap trimming is introduced and is now the default trimming approach used in ClipKIT. smart-gap trimming is a dynamic approach to determine the appropriate gaps threshold for an alignment. diff --git a/tests/integration/expected/simple_long_description.clustal b/tests/integration/expected/simple_long_description.clustal new file mode 100644 index 0000000..fe714b4 --- /dev/null +++ b/tests/integration/expected/simple_long_description.clustal @@ -0,0 +1,10 @@ +CLUSTAL X (1.81) multiple sequence alignment + + +1_extra_info A-GTAT +2_extra_info A-G-AT +3_extra_info A-G-TA +4_extra_info AGA-TA +5_extra_info ACa-T- + + diff --git a/tests/integration/expected/simple_long_description.fa.clipkit b/tests/integration/expected/simple_long_description.fa.clipkit new file mode 100644 index 0000000..054bf1a --- /dev/null +++ b/tests/integration/expected/simple_long_description.fa.clipkit @@ -0,0 +1,10 @@ +>1 extra_info +A-GTAT +>2 extra_info +A-G-AT +>3 extra_info +A-G-TA +>4 extra_info +AGA-TA +>5 extra_info +ACa-T- diff --git a/tests/integration/expected/simple_long_description.fa.clipkit.log b/tests/integration/expected/simple_long_description.fa.clipkit.log new file mode 100644 index 0000000..e6b1459 --- /dev/null +++ 
b/tests/integration/expected/simple_long_description.fa.clipkit.log @@ -0,0 +1,6 @@ +1 keep Const 0.0 +2 keep nConst,nPI 0.6 +3 keep PI 0.0 +4 keep nConst,nPI 0.8 +5 keep PI 0.0 +6 keep PI 0.2 diff --git a/tests/integration/expected/simple_long_description.fa_gappy b/tests/integration/expected/simple_long_description.fa_gappy new file mode 100644 index 0000000..054bf1a --- /dev/null +++ b/tests/integration/expected/simple_long_description.fa_gappy @@ -0,0 +1,10 @@ +>1 extra_info +A-GTAT +>2 extra_info +A-G-AT +>3 extra_info +A-G-TA +>4 extra_info +AGA-TA +>5 extra_info +ACa-T- diff --git a/tests/integration/expected/simple_long_description.fa_gappy.complement b/tests/integration/expected/simple_long_description.fa_gappy.complement new file mode 100644 index 0000000..fa9fd1f --- /dev/null +++ b/tests/integration/expected/simple_long_description.fa_gappy.complement @@ -0,0 +1,5 @@ +>1 extra_info +>2 extra_info +>3 extra_info +>4 extra_info +>5 extra_info diff --git a/tests/integration/expected/simple_long_description.fa_kpi b/tests/integration/expected/simple_long_description.fa_kpi new file mode 100644 index 0000000..75d2849 --- /dev/null +++ b/tests/integration/expected/simple_long_description.fa_kpi @@ -0,0 +1,10 @@ +>1 extra_info +GAT +>2 extra_info +GAT +>3 extra_info +GTA +>4 extra_info +ATA +>5 extra_info +aT- diff --git a/tests/integration/expected/simple_long_description.fa_kpi_gappy b/tests/integration/expected/simple_long_description.fa_kpi_gappy new file mode 100644 index 0000000..75d2849 --- /dev/null +++ b/tests/integration/expected/simple_long_description.fa_kpi_gappy @@ -0,0 +1,10 @@ +>1 extra_info +GAT +>2 extra_info +GAT +>3 extra_info +GTA +>4 extra_info +ATA +>5 extra_info +aT- diff --git a/tests/integration/expected/simple_long_description.fa_kpi_smart_gaps b/tests/integration/expected/simple_long_description.fa_kpi_smart_gaps new file mode 100644 index 0000000..75d2849 --- /dev/null +++ 
b/tests/integration/expected/simple_long_description.fa_kpi_smart_gaps @@ -0,0 +1,10 @@ +>1 extra_info +GAT +>2 extra_info +GAT +>3 extra_info +GTA +>4 extra_info +ATA +>5 extra_info +aT- diff --git a/tests/integration/expected/simple_long_description.fa_kpic b/tests/integration/expected/simple_long_description.fa_kpic new file mode 100644 index 0000000..97a1931 --- /dev/null +++ b/tests/integration/expected/simple_long_description.fa_kpic @@ -0,0 +1,10 @@ +>1 extra_info +AGAT +>2 extra_info +AGAT +>3 extra_info +AGTA +>4 extra_info +AATA +>5 extra_info +AaT- diff --git a/tests/integration/expected/simple_long_description.fa_kpic_gappy b/tests/integration/expected/simple_long_description.fa_kpic_gappy new file mode 100644 index 0000000..97a1931 --- /dev/null +++ b/tests/integration/expected/simple_long_description.fa_kpic_gappy @@ -0,0 +1,10 @@ +>1 extra_info +AGAT +>2 extra_info +AGAT +>3 extra_info +AGTA +>4 extra_info +AATA +>5 extra_info +AaT- diff --git a/tests/integration/expected/simple_long_description.fa_kpic_smart_gaps b/tests/integration/expected/simple_long_description.fa_kpic_smart_gaps new file mode 100644 index 0000000..97a1931 --- /dev/null +++ b/tests/integration/expected/simple_long_description.fa_kpic_smart_gaps @@ -0,0 +1,10 @@ +>1 extra_info +AGAT +>2 extra_info +AGAT +>3 extra_info +AGTA +>4 extra_info +AATA +>5 extra_info +AaT- diff --git a/tests/integration/expected/simple_long_description.fa_smart_gaps b/tests/integration/expected/simple_long_description.fa_smart_gaps new file mode 100644 index 0000000..8823fa6 --- /dev/null +++ b/tests/integration/expected/simple_long_description.fa_smart_gaps @@ -0,0 +1,10 @@ +>1 extra_info +A-GAT +>2 extra_info +A-GAT +>3 extra_info +A-GTA +>4 extra_info +AGATA +>5 extra_info +ACaT- diff --git a/tests/integration/expected/simple_long_description.maf b/tests/integration/expected/simple_long_description.maf new file mode 100644 index 0000000..a9078d8 --- /dev/null +++ 
b/tests/integration/expected/simple_long_description.maf @@ -0,0 +1,10 @@ +##maf version=1 scoring=none +# generated by Biopython + +a score=0.00 +s 1_extra_info 0 5 + 0 A-GTAT +s 2_extra_info 0 4 + 0 A-G-AT +s 3_extra_info 0 4 + 0 A-G-TA +s 4_extra_info 0 5 + 0 AGA-TA +s 5_extra_info 0 4 + 0 ACa-T- + diff --git a/tests/integration/expected/simple_long_description.mauve b/tests/integration/expected/simple_long_description.mauve new file mode 100644 index 0000000..2450ae0 --- /dev/null +++ b/tests/integration/expected/simple_long_description.mauve @@ -0,0 +1,17 @@ +#FormatVersion Mauve1 +#Sequence1Entry 1 +#Sequence2Entry 2 +#Sequence3Entry 3 +#Sequence4Entry 4 +#Sequence5Entry 5 +> 1:0-0 + .fa # 1 extra_info +A-GTAT +> 2:0-0 + .fa # 2 extra_info +A-G-AT +> 3:0-0 + .fa # 3 extra_info +A-G-TA +> 4:0-0 + .fa # 4 extra_info +AGA-TA +> 5:0-0 + .fa # 5 extra_info +ACa-T- += diff --git a/tests/integration/expected/simple_long_description.phylip b/tests/integration/expected/simple_long_description.phylip new file mode 100644 index 0000000..82ad9fb --- /dev/null +++ b/tests/integration/expected/simple_long_description.phylip @@ -0,0 +1,6 @@ + 5 6 +1 extra_in A-GTAT +2 extra_in A-G-AT +3 extra_in A-G-TA +4 extra_in AGA-TA +5 extra_in ACa-T- diff --git a/tests/integration/expected/simple_long_description.phylip-relaxed b/tests/integration/expected/simple_long_description.phylip-relaxed new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/expected/simple_long_description.phylip-sequential b/tests/integration/expected/simple_long_description.phylip-sequential new file mode 100644 index 0000000..9ba1a22 --- /dev/null +++ b/tests/integration/expected/simple_long_description.phylip-sequential @@ -0,0 +1,6 @@ + 5 6 +1 extra_inA-GTAT +2 extra_inA-G-AT +3 extra_inA-G-TA +4 extra_inAGA-TA +5 extra_inACa-T- diff --git a/tests/integration/expected/simple_long_description.stockholm b/tests/integration/expected/simple_long_description.stockholm new file mode 100644 
index 0000000..ae5e941 --- /dev/null +++ b/tests/integration/expected/simple_long_description.stockholm @@ -0,0 +1,13 @@ +# STOCKHOLM 1.0 +#=GF SQ 5 +1_extra_info A-GTAT +#=GS 1_extra_info AC 1 extra_info +2_extra_info A-G-AT +#=GS 2_extra_info AC 2 extra_info +3_extra_info A-G-TA +#=GS 3_extra_info AC 3 extra_info +4_extra_info AGA-TA +#=GS 4_extra_info AC 4 extra_info +5_extra_info ACa-T- +#=GS 5_extra_info AC 5 extra_info +// diff --git a/tests/integration/samples/simple_long_description.fa b/tests/integration/samples/simple_long_description.fa new file mode 100644 index 0000000..054bf1a --- /dev/null +++ b/tests/integration/samples/simple_long_description.fa @@ -0,0 +1,10 @@ +>1 extra_info +A-GTAT +>2 extra_info +A-G-AT +>3 extra_info +A-G-TA +>4 extra_info +AGA-TA +>5 extra_info +ACa-T- diff --git a/tests/integration/samples/simple_long_description.fa.clipkit b/tests/integration/samples/simple_long_description.fa.clipkit new file mode 100644 index 0000000..8823fa6 --- /dev/null +++ b/tests/integration/samples/simple_long_description.fa.clipkit @@ -0,0 +1,10 @@ +>1 extra_info +A-GAT +>2 extra_info +A-GAT +>3 extra_info +A-GTA +>4 extra_info +AGATA +>5 extra_info +ACaT- diff --git a/tests/integration/test_complementary_output.py b/tests/integration/test_complementary_output.py index f0dd02e..c036bc3 100644 --- a/tests/integration/test_complementary_output.py +++ b/tests/integration/test_complementary_output.py @@ -41,6 +41,37 @@ def test_simple_complement(self): assert expected_content == output_content + def test_simple_long_description_complement(self): + """ + test complementary output file with a simple case + usage: clipkit simple_long_description.fa -c + """ + output_file = "output/simple_long_description.fa_gappy" + complement_out_file = f"{output_file}.complement" + + kwargs = dict( + input_file=f"{here.parent}/samples/simple_long_description.fa", + output_file=output_file, + input_file_format='fasta', + output_file_format='fasta', + complement=True, + 
gaps=0.9, + mode=TrimmingMode.gappy, + use_log=False, + ) + + execute(**kwargs) + + with open( + f"{here.parent}/expected/simple_long_description.fa_gappy.complement", "r" + ) as expected: + expected_content = expected.read() + + with open(complement_out_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + def test_12_YIL115C_Anc_2_253_aa_aln_complement(self): """ test complementary output file for amino acid yeast sequences diff --git a/tests/integration/test_gappy_mode.py b/tests/integration/test_gappy_mode.py index e94c744..7aed3c4 100644 --- a/tests/integration/test_gappy_mode.py +++ b/tests/integration/test_gappy_mode.py @@ -38,6 +38,34 @@ def test_simple_no_change(self): assert expected_content == output_content + def test_simple_no_change_long_description(self): + """ + test gappy where no changes are expected in the resulting + output alignment. + usage: clipkit simple.fa + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.fa.clipkit" + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='fasta', + complement=False, + gaps=0.9, + mode=TrimmingMode.gappy, + use_log=False, + ) + execute(**kwargs) + + with open(input_file, "r") as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + def test_12_YIL115C_Anc_2_253_codon_aln(self): """ test gappy with codon alignment of yeast sequences @@ -45,8 +73,6 @@ def test_12_YIL115C_Anc_2_253_codon_aln(self): """ input_file = f"{here.parent}/samples/12_YIL115C_Anc_2.253_codon_aln.fasta" output_file = "output/12_YIL115C_Anc_2.253_codon_aln.fasta.clipkit" - in_file_format = 'fasta' - out_file_format = 'fasta' kwargs = dict( input_file=input_file, @@ -77,8 +103,6 @@ def test_12_YIL115C_Anc_2_253_aa_aln(self): """ 
input_file = f"{here.parent}/samples/12_YIL115C_Anc_2.253_aa_aln.fasta" output_file = "output/12_YIL115C_Anc_2.253_aa_aln.fasta.clipkit" - in_file_format = 'fasta' - out_file_format = 'fasta' kwargs = dict( input_file=input_file, @@ -109,8 +133,6 @@ def test_24_ENSG00000163519_aa_aln(self): """ input_file = f"{here.parent}/samples/24_ENSG00000163519_aa_aln.fasta" output_file = "output/24_ENSG00000163519_aa_aln.fasta.clipkit" - in_file_format = 'fasta' - out_file_format = 'fasta' kwargs = dict( input_file=input_file, @@ -141,8 +163,6 @@ def test_24_ENSG00000163519_codon_aln(self): """ input_file = f"{here.parent}/samples/24_ENSG00000163519_codon_aln.fasta" output_file = "output/24_ENSG00000163519_codon_aln.fasta.clipkit" - in_file_format = 'fasta' - out_file_format = 'fasta' kwargs = dict( input_file=input_file, @@ -173,8 +193,6 @@ def test_EOG091N44M8_aa(self): """ input_file = f"{here.parent}/samples/EOG091N44M8_aa.fa" output_file = "output/EOG091N44M8_aa.fa.clipkit" - in_file_format = 'fasta' - out_file_format = 'fasta' kwargs = dict( input_file=input_file, @@ -203,8 +221,6 @@ def test_EOG091N44M8_nt(self): """ input_file = f"{here.parent}/samples/EOG091N44M8_nt.fa" output_file = "output/EOG091N44M8_nt.fa.clipkit" - in_file_format = 'fasta' - out_file_format = 'fasta' kwargs = dict( input_file=input_file, @@ -227,38 +243,6 @@ def test_EOG091N44M8_nt(self): assert expected_content == output_content @pytest.mark.slow - def test_EOG092C0CZK_aa(self): - """ - test gappy with amino alignment of fungal sequences - usage: clipkit EOG092C0CZK_aa_aln.fasta - """ - input_file = f"{here.parent}/samples/EOG092C0CZK_aa_aln.fasta" - output_file = "output/EOG092C0CZK_aa_aln.fasta.clipkit" - in_file_format = 'fasta' - out_file_format = 'fasta' - - kwargs = dict( - input_file=input_file, - output_file=output_file, - input_file_format='fasta', - output_file_format='fasta', - complement=False, - gaps=0.9, - mode=TrimmingMode.gappy, - use_log=False, - ) - execute(**kwargs) - - with 
open( - f"{here.parent}/expected/EOG092C0CZK_aa_aln.fasta_gappy", "r" - ) as expected: - expected_content = expected.read() - - with open(output_file, "r") as out_file: - output_content = out_file.read() - - assert expected_content == output_content - def test_EOG092C4VOX_aa(self): """ test gappy with amino alignment of fungal sequences @@ -266,8 +250,6 @@ def test_EOG092C4VOX_aa(self): """ input_file = f"{here.parent}/samples/EOG092C4VOX_aa_aln.fasta" output_file = "output/EOG092C4VOX_aa_aln.fasta.clipkit" - in_file_format = 'fasta' - out_file_format = 'fasta' kwargs = dict( input_file=input_file, @@ -301,8 +283,6 @@ def test_simple(self): """ input_file = f"{here.parent}/samples/simple.fa" output_file = "output/simpla.fa.clipkit" - in_file_format = 'fasta' - out_file_format = 'fasta' kwargs = dict( input_file=input_file, @@ -333,8 +313,6 @@ def test_12_YIL115C_Anc_2_253_codon_aln(self): """ input_file = f"{here.parent}/samples/12_YIL115C_Anc_2.253_codon_aln.fasta" output_file = "output/12_YIL115C_Anc_2.253_codon_aln.fasta.clipkit" - in_file_format = 'fasta' - out_file_format = 'fasta' kwargs = dict( input_file=input_file, @@ -451,6 +429,7 @@ def test_EOG092C0CZK_aa(self): assert expected_content == output_content + @pytest.mark.slow def test_EOG092C4VOX_aa(self): """ test gappy with amino alignment of fungal sequences diff --git a/tests/integration/test_kpi_gappy_mode.py b/tests/integration/test_kpi_gappy_mode.py index 4ab0e1e..bd0efbf 100644 --- a/tests/integration/test_kpi_gappy_mode.py +++ b/tests/integration/test_kpi_gappy_mode.py @@ -37,6 +37,33 @@ def test_simple(self): assert expected_content == output_content + def test_simple_long_description(self): + """ + usage: clipkit simple.fa -m kpi-gappy + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.fa.clipkit_kpi_gappy" + + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + 
output_file_format='fasta', + complement=False, + gaps=0.9, + mode=TrimmingMode.kpi_gappy, + use_log=False, + ) + execute(**kwargs) + + with open(f"{here.parent}/expected/simple_long_description.fa_kpi_gappy", "r") as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + def test_12_YIL115C_Anc_2_253_codon_aln(self): """ test kpi_gappy with codon alignment of yeast sequences @@ -219,36 +246,6 @@ def test_EOG091N44M8_nt(self): assert expected_content == output_content @pytest.mark.slow - def test_EOG092C0CZK_aa(self): - """ - test kpi_gappy with amino alignment of fungal sequences - usage: clipkit EOG092C0CZK_aa_aln.fasta -m kpi-gappy - """ - input_file = f"{here.parent}/samples/EOG092C0CZK_aa_aln.fasta" - output_file = "output/EOG092C0CZK_aa_aln.fasta.clipkit" - - kwargs = dict( - input_file=input_file, - output_file=output_file, - input_file_format='fasta', - output_file_format='fasta', - complement=False, - gaps=0.9, - mode=TrimmingMode.kpi_gappy, - use_log=False, - ) - execute(**kwargs) - - with open( - f"{here.parent}/expected/EOG092C0CZK_aa_aln.fasta_kpi_gappy", "r" - ) as expected: - expected_content = expected.read() - - with open(output_file, "r") as out_file: - output_content = out_file.read() - - assert expected_content == output_content - def test_EOG092C4VOX_aa(self): """ test gappy with amino alignment of fungal sequences @@ -436,6 +433,7 @@ def test_EOG092C0CZK_aa(self): assert expected_content == output_content + @pytest.mark.slow def test_EOG092C4VOX_aa(self): """ test kpi_gappy with amino alignment of fungal sequences diff --git a/tests/integration/test_kpi_mode.py b/tests/integration/test_kpi_mode.py index f5bd4fa..3dcbadc 100644 --- a/tests/integration/test_kpi_mode.py +++ b/tests/integration/test_kpi_mode.py @@ -37,6 +37,33 @@ def test_simple(self): assert expected_content == output_content + def 
test_simple_long_description(self): + """ + usage: clipkit simple.fa -kpi + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.fa.TestKpiMode_test_simple.clipkit" + + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='fasta', + complement=False, + gaps=0.9, + mode=TrimmingMode.kpi, + use_log=False, + ) + execute(**kwargs) + + with open(f"{here.parent}/expected/simple_long_description.fa_kpi", "r") as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + def test_12_YIL115C_Anc_2_253_codon_aln(self): """ test kpi with codon alignment of yeast sequences @@ -214,36 +241,6 @@ def test_EOG091N44M8_nt(self): assert expected_content == output_content @pytest.mark.slow - def test_EOG092C0CZK_aa(self): - """ - test kpi with amino alignment of fungal sequences - usage: clipkit EOG092C0CZK_aa_aln.fasta -m kpi - """ - input_file = f"{here.parent}/samples/EOG092C0CZK_aa_aln.fasta" - output_file = "output/EOG092C0CZK_aa_aln.fasta.clipkit" - - kwargs = dict( - input_file=input_file, - output_file=output_file, - input_file_format='fasta', - output_file_format='fasta', - complement=False, - gaps=0.9, - mode=TrimmingMode.kpi, - use_log=False, - ) - execute(**kwargs) - - with open( - f"{here.parent}/expected/EOG092C0CZK_aa_aln.fasta_kpi", "r" - ) as expected: - expected_content = expected.read() - - with open(output_file, "r") as out_file: - output_content = out_file.read() - - assert expected_content == output_content - def test_EOG092C4VOX_aa(self): """ test kpi with amino alignment of fungal sequences diff --git a/tests/integration/test_kpi_smart_gap_mode.py b/tests/integration/test_kpi_smart_gap_mode.py index 4feebec..b01da8b 100644 --- a/tests/integration/test_kpi_smart_gap_mode.py +++ 
b/tests/integration/test_kpi_smart_gap_mode.py @@ -38,6 +38,34 @@ def test_simple_no_change(self): assert expected_content == output_content + def test_simple_long_description(self): + """ + usage: clipkit simple_long_description.fa -m kpi-smart-gap + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.fa_kpi_smart_gaps" + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='fasta', + complement=False, + gaps=0.8, + mode=TrimmingMode.kpi_smart_gap, + use_log=False, + ) + execute(**kwargs) + + with open( + f"{here.parent}/expected/simple_long_description.fa_kpi_smart_gaps", "r" + ) as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + def test_12_YIL115C_Anc_2_253_codon_aln(self): """ test gappy with codon alignment of yeast sequences @@ -227,38 +255,6 @@ def test_EOG091N44M8_nt(self): assert expected_content == output_content @pytest.mark.slow - def test_EOG092C0CZK_aa(self): - """ - test gappy with amino alignment of fungal sequences - usage: clipkit EOG092C0CZK_aa_aln.fasta -m kpi-smart-gap - """ - input_file = f"{here.parent}/samples/EOG092C0CZK_aa_aln.fasta" - output_file = "output/EOG092C0CZK_aa_aln.fasta.clipkit" - in_file_format = 'fasta' - out_file_format = 'fasta' - - kwargs = dict( - input_file=input_file, - output_file=output_file, - input_file_format='fasta', - output_file_format='fasta', - complement=False, - gaps=0.9986, - mode=TrimmingMode.kpi_smart_gap, - use_log=False, - ) - execute(**kwargs) - - with open( - f"{here.parent}/expected/EOG092C0CZK_aa_aln.clipkit_kpi_smart_gaps", "r" - ) as expected: - expected_content = expected.read() - - with open(output_file, "r") as out_file: - output_content = out_file.read() - - assert expected_content == output_content - def test_EOG092C4VOX_aa(self): """ 
test gappy with amino alignment of fungal sequences diff --git a/tests/integration/test_kpic_gappy_mode.py b/tests/integration/test_kpic_gappy_mode.py index fd4d1a4..23fe9d6 100644 --- a/tests/integration/test_kpic_gappy_mode.py +++ b/tests/integration/test_kpic_gappy_mode.py @@ -37,6 +37,33 @@ def test_simple(self): assert expected_content == output_content + def test_simple_long_description(self): + """ + usage: clipkit simple_long_description.fa -m kpic-gappy + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.fa.TestKpiMode_test_simple.clipkit" + + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='fasta', + complement=False, + gaps=0.9, + mode=TrimmingMode.kpic_gappy, + use_log=False, + ) + execute(**kwargs) + + with open(f"{here.parent}/expected/simple_long_description.fa_kpic_gappy", "r") as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + def test_12_YIL115C_Anc_2_253_codon_aln(self): """ test kpic-gappy with codon alignment of yeast sequences @@ -219,36 +246,6 @@ def test_EOG091N44M8_nt(self): assert expected_content == output_content @pytest.mark.slow - def test_EOG092C0CZK_aa(self): - """ - test kpic-gappy with amino alignment of fungal sequences - usage: clipkit EOG092C0CZK_aa_aln.fasta -m kpic-gappy - """ - input_file = f"{here.parent}/samples/EOG092C0CZK_aa_aln.fasta" - output_file = "output/EOG092C0CZK_aa_aln.fasta.clipkitc" - - kwargs = dict( - input_file=input_file, - output_file=output_file, - input_file_format='fasta', - output_file_format='fasta', - complement=False, - gaps=0.9, - mode=TrimmingMode.kpic_gappy, - use_log=False, - ) - execute(**kwargs) - - with open( - f"{here.parent}/expected/EOG092C0CZK_aa_aln.fasta_kpic_gappy", "r" - ) as expected: - expected_content = expected.read() - 
- with open(output_file, "r") as out_file: - output_content = out_file.read() - - assert expected_content == output_content - def test_EOG092C4VOX_aa(self): """ test kpic-gappy with amino alignment of fungal sequences diff --git a/tests/integration/test_kpic_mode.py b/tests/integration/test_kpic_mode.py index a0e27a3..141a777 100644 --- a/tests/integration/test_kpic_mode.py +++ b/tests/integration/test_kpic_mode.py @@ -37,6 +37,33 @@ def test_simple(self): assert expected_content == output_content + def test_simple_long_description(self): + """ + usage: clipkit simple_long_description.fa -m kpic + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.fa.TestKpiMode_test_simple.clipkit" + + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='fasta', + complement=False, + gaps=0.9, + mode=TrimmingMode.kpic, + use_log=False, + ) + execute(**kwargs) + + with open(f"{here.parent}/expected/simple_long_description.fa_kpic", "r") as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + def test_12_YIL115C_Anc_2_253_codon_aln(self): """ test kpic with codon alignment of yeast sequences @@ -214,36 +241,6 @@ def test_EOG091N44M8_nt(self): assert expected_content == output_content @pytest.mark.slow - def test_EOG092C0CZK_aa(self): - """ - test kpic with amino alignment of fungal sequences - usage: clipkit EOG092C0CZK_aa_aln.fasta -m kpic - """ - input_file = f"{here.parent}/samples/EOG092C0CZK_aa_aln.fasta" - output_file = "output/EOG092C0CZK_aa_aln.fasta.clipkitc" - - kwargs = dict( - input_file=input_file, - output_file=output_file, - input_file_format='fasta', - output_file_format='fasta', - complement=False, - gaps=0.9, - mode=TrimmingMode.kpic, - use_log=False, - ) - execute(**kwargs) - - with open( - 
f"{here.parent}/expected/EOG092C0CZK_aa_aln.fasta_kpic", "r" - ) as expected: - expected_content = expected.read() - - with open(output_file, "r") as out_file: - output_content = out_file.read() - - assert expected_content == output_content - def test_EOG092C4VOX_aa(self): """ test kpic with amino alignment of fungal sequences diff --git a/tests/integration/test_kpic_smart_gap_mode.py b/tests/integration/test_kpic_smart_gap_mode.py index 32b0d3e..e4ba713 100644 --- a/tests/integration/test_kpic_smart_gap_mode.py +++ b/tests/integration/test_kpic_smart_gap_mode.py @@ -38,6 +38,34 @@ def test_simple_no_change(self): assert expected_content == output_content + def test_simple_no_change(self): + """ + usage: clipkit simple_long_description.fa -m kpic-smart-gap + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.fa_kpic_smart_gaps" + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='fasta', + complement=False, + gaps=0.8, + mode=TrimmingMode.kpic_smart_gap, + use_log=False, + ) + execute(**kwargs) + + with open( + f"{here.parent}/expected/simple_long_description.fa_kpic_smart_gaps", "r" + ) as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + def test_12_YIL115C_Anc_2_253_codon_aln(self): """ test gappy with codon alignment of yeast sequences @@ -227,38 +255,6 @@ def test_EOG091N44M8_nt(self): assert expected_content == output_content @pytest.mark.slow - def test_EOG092C0CZK_aa(self): - """ - test gappy with amino alignment of fungal sequences - usage: clipkit EOG092C0CZK_aa_aln.fasta -m kpic-smart-gap - """ - input_file = f"{here.parent}/samples/EOG092C0CZK_aa_aln.fasta" - output_file = "output/EOG092C0CZK_aa_aln.fasta.clipkit" - in_file_format = 'fasta' - out_file_format = 'fasta' - - kwargs = dict( - 
input_file=input_file, - output_file=output_file, - input_file_format='fasta', - output_file_format='fasta', - complement=False, - gaps=0.9986, - mode=TrimmingMode.kpic_smart_gap, - use_log=False, - ) - execute(**kwargs) - - with open( - f"{here.parent}/expected/EOG092C0CZK_aa_aln.clipkit_kpic_smart_gaps", "r" - ) as expected: - expected_content = expected.read() - - with open(output_file, "r") as out_file: - output_content = out_file.read() - - assert expected_content == output_content - def test_EOG092C4VOX_aa(self): """ test gappy with amino alignment of fungal sequences diff --git a/tests/integration/test_smart_gap_mode.py b/tests/integration/test_smart_gap_mode.py index ff8bcc0..a96b1ce 100644 --- a/tests/integration/test_smart_gap_mode.py +++ b/tests/integration/test_smart_gap_mode.py @@ -38,6 +38,34 @@ def test_simple_no_change(self): assert expected_content == output_content + def test_simple_simple_long_description_no_change(self): + """ + usage: clipkit simple_long_description.fa + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.fa_smart_gaps" + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='fasta', + complement=False, + gaps=0.8, + mode=TrimmingMode.smart_gap, + use_log=False, + ) + execute(**kwargs) + + with open( + f"{here.parent}/expected/simple_long_description.fa_smart_gaps", "r" + ) as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + def test_12_YIL115C_Anc_2_253_codon_aln(self): """ test gappy with codon alignment of yeast sequences @@ -227,38 +255,6 @@ def test_EOG091N44M8_nt(self): assert expected_content == output_content @pytest.mark.slow - def test_EOG092C0CZK_aa(self): - """ - test gappy with amino alignment of fungal sequences - usage: clipkit EOG092C0CZK_aa_aln.fasta - """ - 
input_file = f"{here.parent}/samples/EOG092C0CZK_aa_aln.fasta" - output_file = "output/EOG092C0CZK_aa_aln.fasta.clipkit" - in_file_format = 'fasta' - out_file_format = 'fasta' - - kwargs = dict( - input_file=input_file, - output_file=output_file, - input_file_format='fasta', - output_file_format='fasta', - complement=False, - gaps=0.9986, - mode=TrimmingMode.smart_gap, - use_log=False, - ) - execute(**kwargs) - - with open( - f"{here.parent}/expected/EOG092C0CZK_aa_aln.clipkit_smart_gaps", "r" - ) as expected: - expected_content = expected.read() - - with open(output_file, "r") as out_file: - output_content = out_file.read() - - assert expected_content == output_content - def test_EOG092C4VOX_aa(self): """ test gappy with amino alignment of fungal sequences diff --git a/tests/integration/test_writing_to_different_output_file_formats.py b/tests/integration/test_writing_to_different_output_file_formats.py index edfb751..122e278 100644 --- a/tests/integration/test_writing_to_different_output_file_formats.py +++ b/tests/integration/test_writing_to_different_output_file_formats.py @@ -205,3 +205,171 @@ def test_stockholm(self): output_content = out_file.read() assert expected_content == output_content + + def test_clustal_long_description(self): + """ + test output in clustal format + usage: clipkit simple_long_description.fa -of clustal + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.clustal" + + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='clustal', + complement=False, + gaps=0.9, + mode=TrimmingMode.gappy, + use_log=False, + ) + execute(**kwargs) + + with open(f"{here.parent}/expected/simple_long_description.clustal", "r") as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + + def 
test_maf_long_description(self): + """ + test output in maf format + usage: clipkit simple_long_description.fa -of maf + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.maf" + + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='maf', + complement=False, + gaps=0.9, + mode=TrimmingMode.gappy, + use_log=False, + ) + execute(**kwargs) + + with open(f"{here.parent}/expected/simple_long_description.maf", "r") as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + + def test_mauve_long_description(self): + """ + test output in mauve format + usage: clipkit simple_long_description.fa -of mauve + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.mauve" + + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='mauve', + complement=False, + gaps=0.9, + mode=TrimmingMode.gappy, + use_log=False, + ) + execute(**kwargs) + + with open(f"{here.parent}/expected/simple_long_description.mauve", "r") as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + + def test_phylip_long_description(self): + """ + test output in phylip format + usage: clipkit simple_long_description.fa -of phylip + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.phylip" + + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='phylip', + complement=False, + gaps=0.9, + mode=TrimmingMode.gappy, + use_log=False, + ) + execute(**kwargs) + + with 
open(f"{here.parent}/expected/simple_long_description.phylip", "r") as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + + def test_phylip_sequential_long_description(self): + """ + test output in phylip-sequential format + usage: clipkit simple_long_description.fa -of phylip-sequential + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.phylip-sequential" + + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='phylip_sequential', + complement=False, + gaps=0.9, + mode=TrimmingMode.gappy, + use_log=False, + ) + execute(**kwargs) + + with open(f"{here.parent}/expected/simple_long_description.phylip-sequential", "r") as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + + def test_stockholm_long_description(self): + """ + test output in stockholm format + usage: clipkit simple_long_description.fa -of stockholm + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.stockholm" + + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='stockholm', + complement=False, + gaps=0.9, + mode=TrimmingMode.gappy, + use_log=False, + ) + execute(**kwargs) + + with open(f"{here.parent}/expected/simple_long_description.stockholm", "r") as expected: + expected_content = expected.read() + + with open(output_file, "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content diff --git a/tests/integration/test_writing_to_log_file.py b/tests/integration/test_writing_to_log_file.py index a12f087..34b2e3d 100644 --- 
a/tests/integration/test_writing_to_log_file.py +++ b/tests/integration/test_writing_to_log_file.py @@ -38,6 +38,34 @@ def test_simple(self): assert expected_content == output_content + def test_simple_long_description(self): + """ + test writing the trimming log file + usage: clipkit simple_long_description.fa -l + """ + input_file = f"{here.parent}/samples/simple_long_description.fa" + output_file = "output/simple_long_description.fa.clipkit" + + kwargs = dict( + input_file=input_file, + output_file=output_file, + input_file_format='fasta', + output_file_format='fasta', + complement=False, + gaps=0.9, + mode=TrimmingMode.gappy, + use_log=True, + ) + execute(**kwargs) + + with open(f"{here.parent}/expected/simple_long_description.fa.clipkit.log", "r") as expected: + expected_content = expected.read() + + with open(f"{output_file}.log", "r") as out_file: + output_content = out_file.read() + + assert expected_content == output_content + def test_12_YIL115C_Anc_2_253_codon_aln(self): """ test output in clustal format diff --git a/tests/unit/test_modes.py b/tests/unit/test_modes.py index a727413..18c68b9 100644 --- a/tests/unit/test_modes.py +++ b/tests/unit/test_modes.py @@ -142,10 +142,10 @@ def test_gappy_mode(self): constant_site = False for entry in alignment: - keepD[entry.id] = np.empty([6], dtype=str) + keepD[entry.description] = np.empty([6], dtype=str) trimD = {} for entry in alignment: - trimD[entry.id] = np.empty([6], dtype=str) + trimD[entry.description] = np.empty([6], dtype=str) ## execution keepD, trimD = trim( @@ -199,10 +199,10 @@ def test_kpi_gappy_mode(self): use_log = False for entry in alignment: - keepD[entry.id] = np.empty([6], dtype=str) + keepD[entry.description] = np.empty([6], dtype=str) trimD = {} for entry in alignment: - trimD[entry.id] = np.empty([6], dtype=str) + trimD[entry.description] = np.empty([6], dtype=str) ## execution keepD, trimD = trim( @@ -256,10 +256,10 @@ def test_kpi_mode(self): use_log = False for entry in alignment: - 
keepD[entry.id] = np.empty([6], dtype=str) + keepD[entry.description] = np.empty([6], dtype=str) trimD = {} for entry in alignment: - trimD[entry.id] = np.empty([6], dtype=str) + trimD[entry.description] = np.empty([6], dtype=str) ## execution keepD, trimD = trim( @@ -313,10 +313,10 @@ def test_kpic_mode(self): use_log = False for entry in alignment: - keepD[entry.id] = np.empty([6], dtype=str) + keepD[entry.description] = np.empty([6], dtype=str) trimD = {} for entry in alignment: - trimD[entry.id] = np.empty([6], dtype=str) + trimD[entry.description] = np.empty([6], dtype=str) ## execution keepD, trimD = trim( @@ -370,10 +370,10 @@ def test_kpic_gappy_mode(self): use_log = False for entry in alignment: - keepD[entry.id] = np.empty([6], dtype=str) + keepD[entry.description] = np.empty([6], dtype=str) trimD = {} for entry in alignment: - trimD[entry.id] = np.empty([6], dtype=str) + trimD[entry.description] = np.empty([6], dtype=str) ## execution keepD, trimD = trim(