Skip to content

Commit

Permalink
version bump. entire sequence description, rather than just the ident…
Browse files Browse the repository at this point in the history
…ifier, is kept in the output files of ClipKIT
  • Loading branch information
JLSteenwyk committed Dec 7, 2021
1 parent 001f251 commit be991d7
Show file tree
Hide file tree
Showing 37 changed files with 660 additions and 286 deletions.
9 changes: 6 additions & 3 deletions change_log.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
Major changes to ClipKIT are summarized here.

1.3.0
long descriptions of sequences, rather than just their identifiers, are kept in the ClipKIT output

1.1.5
carried over code base to biopython, v1.79

1.1.0
smart-gap trimming is introduced and is now the default trimming approach used in ClipKIT.
smart-gap trimming is a dynamic approach to determine the appropriate gaps threshold for an alignment.

1.1.5
carried over code base to biopython, v1.79
7 changes: 3 additions & 4 deletions clipkit/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,11 @@ def populate_empty_keepD_and_trimD(alignment):
biopython multiple sequence alignment object
"""
keepD = {}
alignment_length = alignment.get_alignment_length()
for entry in alignment:
keepD[entry.id] = np.zeros([alignment_length], dtype=bytes)
trimD = {}
alignment_length = alignment.get_alignment_length()
for entry in alignment:
trimD[entry.id] = np.zeros([alignment_length], dtype=bytes)
keepD[entry.description] = np.zeros([alignment_length], dtype=bytes)
trimD[entry.description] = np.zeros([alignment_length], dtype=bytes)

return keepD, trimD

Expand Down
4 changes: 2 additions & 2 deletions clipkit/modes.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def trim(
# save to keepD
if shouldKeep(mode, parsimony_informative, constant_site, gappyness, gaps):
for entry in alignment:
keepD[entry.id][alignment_position] = entry.seq._data[alignment_position:alignment_position+1]
keepD[entry.description][alignment_position] = entry.seq._data[alignment_position:alignment_position+1]
if use_log:
if constant_site:
logger.debug(f"{str(alignment_position + 1)} keep Const {gappyness}")
Expand All @@ -93,7 +93,7 @@ def trim(
# save to trimD
else:
for entry in alignment:
trimD[entry.id][alignment_position] = entry.seq._data[alignment_position:alignment_position+1]
trimD[entry.description][alignment_position] = entry.seq._data[alignment_position:alignment_position+1]
if use_log:
if constant_site:
logger.debug(f"{str(alignment_position + 1)} trim Const {gappyness}")
Expand Down
2 changes: 1 addition & 1 deletion clipkit/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.2.0'
__version__ = '1.3.0'
6 changes: 6 additions & 0 deletions docs/change_log/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ Change log

Major changes to ClipKIT are summarized here.

**1.3.0**
long descriptions of sequences, rather than just their identifiers, are kept in the ClipKIT output

**1.1.5**
carried over code base to biopython, v1.79

**1.1.0:**
smart-gap trimming is introduced and is now the default trimming approach used in ClipKIT.
smart-gap trimming is a dynamic approach to determine the appropriate gaps threshold for an alignment.
10 changes: 10 additions & 0 deletions tests/integration/expected/simple_long_description.clustal
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
CLUSTAL X (1.81) multiple sequence alignment


1_extra_info A-GTAT
2_extra_info A-G-AT
3_extra_info A-G-TA
4_extra_info AGA-TA
5_extra_info ACa-T-


10 changes: 10 additions & 0 deletions tests/integration/expected/simple_long_description.fa.clipkit
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1 extra_info
A-GTAT
>2 extra_info
A-G-AT
>3 extra_info
A-G-TA
>4 extra_info
AGA-TA
>5 extra_info
ACa-T-
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
1 keep Const 0.0
2 keep nConst,nPI 0.6
3 keep PI 0.0
4 keep nConst,nPI 0.8
5 keep PI 0.0
6 keep PI 0.2
10 changes: 10 additions & 0 deletions tests/integration/expected/simple_long_description.fa_gappy
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1 extra_info
A-GTAT
>2 extra_info
A-G-AT
>3 extra_info
A-G-TA
>4 extra_info
AGA-TA
>5 extra_info
ACa-T-
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
>1 extra_info
>2 extra_info
>3 extra_info
>4 extra_info
>5 extra_info
10 changes: 10 additions & 0 deletions tests/integration/expected/simple_long_description.fa_kpi
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1 extra_info
GAT
>2 extra_info
GAT
>3 extra_info
GTA
>4 extra_info
ATA
>5 extra_info
aT-
10 changes: 10 additions & 0 deletions tests/integration/expected/simple_long_description.fa_kpi_gappy
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1 extra_info
GAT
>2 extra_info
GAT
>3 extra_info
GTA
>4 extra_info
ATA
>5 extra_info
aT-
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1 extra_info
GAT
>2 extra_info
GAT
>3 extra_info
GTA
>4 extra_info
ATA
>5 extra_info
aT-
10 changes: 10 additions & 0 deletions tests/integration/expected/simple_long_description.fa_kpic
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1 extra_info
AGAT
>2 extra_info
AGAT
>3 extra_info
AGTA
>4 extra_info
AATA
>5 extra_info
AaT-
10 changes: 10 additions & 0 deletions tests/integration/expected/simple_long_description.fa_kpic_gappy
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1 extra_info
AGAT
>2 extra_info
AGAT
>3 extra_info
AGTA
>4 extra_info
AATA
>5 extra_info
AaT-
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1 extra_info
AGAT
>2 extra_info
AGAT
>3 extra_info
AGTA
>4 extra_info
AATA
>5 extra_info
AaT-
10 changes: 10 additions & 0 deletions tests/integration/expected/simple_long_description.fa_smart_gaps
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1 extra_info
A-GAT
>2 extra_info
A-GAT
>3 extra_info
A-GTA
>4 extra_info
AGATA
>5 extra_info
ACaT-
10 changes: 10 additions & 0 deletions tests/integration/expected/simple_long_description.maf
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
##maf version=1 scoring=none
# generated by Biopython

a score=0.00
s 1_extra_info 0 5 + 0 A-GTAT
s 2_extra_info 0 4 + 0 A-G-AT
s 3_extra_info 0 4 + 0 A-G-TA
s 4_extra_info 0 5 + 0 AGA-TA
s 5_extra_info 0 4 + 0 ACa-T-

17 changes: 17 additions & 0 deletions tests/integration/expected/simple_long_description.mauve
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#FormatVersion Mauve1
#Sequence1Entry 1
#Sequence2Entry 2
#Sequence3Entry 3
#Sequence4Entry 4
#Sequence5Entry 5
> 1:0-0 + <unknown name>.fa # 1 extra_info
A-GTAT
> 2:0-0 + <unknown name>.fa # 2 extra_info
A-G-AT
> 3:0-0 + <unknown name>.fa # 3 extra_info
A-G-TA
> 4:0-0 + <unknown name>.fa # 4 extra_info
AGA-TA
> 5:0-0 + <unknown name>.fa # 5 extra_info
ACa-T-
=
6 changes: 6 additions & 0 deletions tests/integration/expected/simple_long_description.phylip
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
5 6
1 extra_in A-GTAT
2 extra_in A-G-AT
3 extra_in A-G-TA
4 extra_in AGA-TA
5 extra_in ACa-T-
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
5 6
1 extra_inA-GTAT
2 extra_inA-G-AT
3 extra_inA-G-TA
4 extra_inAGA-TA
5 extra_inACa-T-
13 changes: 13 additions & 0 deletions tests/integration/expected/simple_long_description.stockholm
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# STOCKHOLM 1.0
#=GF SQ 5
1_extra_info A-GTAT
#=GS 1_extra_info AC 1 extra_info
2_extra_info A-G-AT
#=GS 2_extra_info AC 2 extra_info
3_extra_info A-G-TA
#=GS 3_extra_info AC 3 extra_info
4_extra_info AGA-TA
#=GS 4_extra_info AC 4 extra_info
5_extra_info ACa-T-
#=GS 5_extra_info AC 5 extra_info
//
10 changes: 10 additions & 0 deletions tests/integration/samples/simple_long_description.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1 extra_info
A-GTAT
>2 extra_info
A-G-AT
>3 extra_info
A-G-TA
>4 extra_info
AGA-TA
>5 extra_info
ACa-T-
10 changes: 10 additions & 0 deletions tests/integration/samples/simple_long_description.fa.clipkit
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1 extra_info
A-GAT
>2 extra_info
A-GAT
>3 extra_info
A-GTA
>4 extra_info
AGATA
>5 extra_info
ACaT-
31 changes: 31 additions & 0 deletions tests/integration/test_complementary_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,37 @@ def test_simple_complement(self):

assert expected_content == output_content

def test_simple_long_description_complement(self):
    """
    test the complementary output file for an alignment whose FASTA
    headers carry extra text after the identifier (e.g. ">1 extra_info")
    usage: clipkit simple_long_description.fa -c
    """
    output_file = "output/simple_long_description.fa_gappy"
    complement_out_file = f"{output_file}.complement"

    # Run gappy-mode trimming with the complement file enabled.
    execute(
        input_file=f"{here.parent}/samples/simple_long_description.fa",
        output_file=output_file,
        input_file_format='fasta',
        output_file_format='fasta',
        complement=True,
        gaps=0.9,
        mode=TrimmingMode.gappy,
        use_log=False,
    )

    expected_path = f"{here.parent}/expected/simple_long_description.fa_gappy.complement"
    with open(expected_path, "r") as expected_handle:
        expected_content = expected_handle.read()

    with open(complement_out_file, "r") as produced_handle:
        output_content = produced_handle.read()

    # The produced complement must match the stored fixture exactly.
    assert expected_content == output_content

def test_12_YIL115C_Anc_2_253_aa_aln_complement(self):
"""
test complementary output file for amino acid yeast sequences
Expand Down
Loading

0 comments on commit be991d7

Please sign in to comment.