Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Numpy performance improvements #37

Merged
merged 16 commits into from
Sep 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions change_log.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
Major changes to ClipKIT are summarized here.

2.0.0 through 2.1.0
Introduce and refactor MSA class. Rely on Numpy functionality to accelerate processes.

1.3.0
long description of sequences, rather than identifiers, are kept in the ClipKIT output

Expand Down
4 changes: 2 additions & 2 deletions clipkit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from .clipkit import run
from .files import FileFormat
from .helpers import SeqType, write_keep_msa
from .helpers import SeqType, write_msa
from .logger import logger
from .modes import TrimmingMode

Expand Down Expand Up @@ -59,5 +59,5 @@ def clipkit(
if not output_file_path:
return trim_run, stats
else:
write_keep_msa(trim_run.keep_msa, output_file_path, trim_run.output_file_format)
write_msa(trim_run.msa, output_file_path, trim_run.output_file_format)
return output_file_path, stats
47 changes: 21 additions & 26 deletions clipkit/clipkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
from Bio.Align import MultipleSeqAlignment
from .args_processing import process_args
from .exceptions import InvalidInputFileFormat
from .files import get_alignment_and_format, FileFormat
from .files import get_alignment_and_format, FileFormat, write_debug_log_file
from .helpers import (
create_msa,
get_seq_type,
get_gap_chars,
keep_trim_and_log,
write_keep_msa,
write_trim_msa,
write_msa,
write_complement,
SeqType,
)
from .logger import logger, log_file_logger
Expand All @@ -23,16 +23,16 @@
from .parser import create_parser
from .settings import DEFAULT_AA_GAP_CHARS, DEFAULT_NT_GAP_CHARS
from .smart_gap_helper import smart_gap_threshold_determination
from .stats import TrimmingStats
from .version import __version__ as current_version
from .warnings import (
warn_if_all_sites_were_trimmed,
warn_if_entry_contains_only_gaps,
)
from .write import (
write_determining_smart_gap_threshold,
write_user_args,
write_output_stats,
write_processing_aln,
write_output_files_message,
)

from dataclasses import dataclass
Expand All @@ -41,23 +41,21 @@
@dataclass
class TrimRun:
alignment: MultipleSeqAlignment
keep_msa: MSA
trim_msa: MSA
msa: MSA
gap_characters: list
sequence_type: SeqType
input_file_format: FileFormat
output_file_format: FileFormat
site_classification_counts: dict
gaps: float
version: str = current_version

@property
def complement(self):
return self.trim_msa.to_bio_msa() if self.trim_msa else None
return self.msa.complement_to_bio_msa()

@property
def trimmed(self):
return self.keep_msa.to_bio_msa()
return self.msa.to_bio_msa()


def run(
Expand Down Expand Up @@ -98,28 +96,22 @@ def run(
TrimmingMode.kpi_smart_gap,
TrimmingMode.kpic_smart_gap,
}:
write_determining_smart_gap_threshold()
gaps = smart_gap_threshold_determination(alignment, gap_characters, quiet)
gaps = smart_gap_threshold_determination(alignment, gap_characters)

# instantiates MSAs to track what we keep/trim from the alignment
keep_msa, trim_msa, site_classification_counts = keep_trim_and_log(
alignment, gaps, mode, use_log, output_file, complement, gap_characters, quiet
)
msa = create_msa(alignment, gap_characters)
msa.trim(mode, gap_threshold=gaps)

trim_run = TrimRun(
alignment,
keep_msa,
trim_msa,
msa,
gap_characters,
sequence_type,
input_file_format,
output_file_format,
site_classification_counts,
gaps,
)
stats = TrimmingStats(trim_run.alignment, trim_run.keep_msa, trim_run.trim_msa)

return trim_run, stats
return trim_run, msa.stats


def execute(
Expand Down Expand Up @@ -149,6 +141,7 @@ def execute(
# for reporting runtime duration to user
start_time = time.time()

write_processing_aln()
trim_run, stats = run(
input_file,
input_file_format,
Expand All @@ -162,6 +155,7 @@ def execute(
use_log,
quiet,
)
write_output_files_message(output_file, complement, use_log)

# display to user what args are being used in stdout
write_user_args(
Expand All @@ -178,14 +172,15 @@ def execute(
)

if use_log:
warn_if_all_sites_were_trimmed(trim_run.keep_msa)
warn_if_entry_contains_only_gaps(trim_run.keep_msa, trim_run.sequence_type)
warn_if_all_sites_were_trimmed(trim_run.msa)
warn_if_entry_contains_only_gaps(trim_run.msa)
write_debug_log_file(trim_run.msa)

write_keep_msa(trim_run.keep_msa, output_file, trim_run.output_file_format)
write_msa(trim_run.msa, output_file, trim_run.output_file_format)

# if the -c/--complementary argument was used, create an alignment of the trimmed sequences
if complement:
write_trim_msa(trim_run.trim_msa, output_file, trim_run.output_file_format)
write_complement(trim_run.msa, output_file, trim_run.output_file_format)

write_output_stats(stats, start_time)

Expand Down
10 changes: 6 additions & 4 deletions clipkit/files.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
from enum import Enum
import logging
from .logger import log_file_logger

from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment

from .exceptions import InvalidInputFileFormat


log = logging.getLogger(__name__)


class FileFormat(Enum):
fasta = "fasta"
clustal = "clustal"
Expand Down Expand Up @@ -47,3 +44,8 @@ def get_alignment_and_format(
continue

raise InvalidInputFileFormat("File could not be read")


def write_debug_log_file(msa):
for info in msa.generate_debug_log_info():
log_file_logger.debug(f"{str(info[0] + 1)} {info[1]} {info[2].value} {info[3]}")
150 changes: 11 additions & 139 deletions clipkit/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
from tqdm import tqdm

from .msa import MSA
from .modes import SiteClassificationType, TrimmingMode, trim
from .modes import TrimmingMode
from .settings import DEFAULT_AA_GAP_CHARS, DEFAULT_NT_GAP_CHARS
from .files import FileFormat
from .write import write_processing_aln, write_output_files_message
from .stats import TrimmingStats

from enum import Enum

Expand Down Expand Up @@ -38,102 +38,25 @@ def get_seq_type(alignment: MultipleSeqAlignment) -> SeqType:
return sequence_type


def get_alignment_column(alignment: MultipleSeqAlignment, index: int) -> str:
alignment_column = ""
alignment_column += alignment[:, index]
return alignment_column.upper()


def get_gap_chars(seq_type: SeqType) -> list[str]:
if seq_type == SeqType.nt:
return DEFAULT_NT_GAP_CHARS
else:
return DEFAULT_AA_GAP_CHARS


def report_column_features(
alignment: MultipleSeqAlignment, index: int, gap_chars: list
) -> tuple[str, float]:
"""
Count the occurence of each character at a given position
in an alignment. This information is used to determine
if the alignment is parsimony informative or not. When
counting characters, gaps are excluded.
"""
alignment_column = get_alignment_column(alignment, index)

column_length = len(alignment_column)

number_of_gaps = sum([alignment_column.count(char) for char in gap_chars])
gappyness = number_of_gaps / column_length

return alignment_column, gappyness


def count_characters_at_position(alignment_column: str, gap_chars: list) -> dict:
character_counts = {}

alignment_column = remove_gaps(alignment_column, gap_chars)
for char in set(alignment_column):
character_counts[char] = alignment_column.count(char)
return character_counts


def determine_site_classification_type(
character_counts: dict,
) -> SiteClassificationType:
"""
Determines if a site is parsimony informative or constant.
A site is parsimony-informative if it contains at least two types of nucleotides
(or amino acids), and at least two of them occur with a minimum frequency of two.
https://www.megasoftware.net/web_help_7/rh_parsimony_informative_site.htm

A site is constant if it contains only one character and that character occurs
at least twice. https://www.megasoftware.net/web_help_7/rh_constant_site.htm

A singleton is a site that contains at least two types of characters with, at most,
one occuring multiple times. https://www.megasoftware.net/web_help_7/rh_singleton_sites.htm
"""
parsimony_informative_threshold = 2
counts_gte_threshold = 0

for count in character_counts.values():
if count >= 2:
counts_gte_threshold += 1
if counts_gte_threshold >= parsimony_informative_threshold:
return SiteClassificationType.parsimony_informative

if counts_gte_threshold == 1 and len(character_counts) == 1:
return SiteClassificationType.constant
elif counts_gte_threshold == 1 and len(character_counts) > 1:
return SiteClassificationType.singleton

return SiteClassificationType.other


def create_keep_and_trim_msas(
alignment: MultipleSeqAlignment, complement: bool
) -> tuple[MSA, MSA]:
def create_msa(alignment: MultipleSeqAlignment, gap_chars: list[str] = None) -> MSA:
"""
Creates MSA classes to track sites kept and trimmed
Create MSA class
"""
return MSA.from_bio_msa(alignment, gap_chars)

keep_msa = MSA.from_bio_msa(alignment)
if complement:
trim_msa = MSA.from_bio_msa(alignment)
else:
trim_msa = None

return keep_msa, trim_msa


def write_keep_msa(
keep_msa: MSA, out_file_name: str, out_file_format: FileFormat
) -> None:
def write_msa(msa: MSA, out_file_name: str, out_file_format: FileFormat) -> None:
"""
keep_msa is populated with sites that are kept after trimming is finished
msa is populated with sites that are kept after trimming is finished
"""
output_msa = keep_msa.to_bio_msa()
output_msa = msa.to_bio_msa()
if out_file_format.value == "phylip_relaxed":
SeqIO.write(output_msa, out_file_name, "phylip-relaxed")
elif out_file_format.value == "phylip_sequential":
Expand All @@ -142,65 +65,14 @@ def write_keep_msa(
SeqIO.write(output_msa, out_file_name, out_file_format.value)


def write_trim_msa(trim_msa: MSA, out_file: str, out_file_format: FileFormat) -> None:
def write_complement(msa: MSA, out_file: str, out_file_format: FileFormat) -> None:
"""
trim_msa is populated with sites that are trimmed after trimming is finished
msa is populated with sites that are trimmed after trimming is finished
"""
output_msa = trim_msa.to_bio_msa()
output_msa = msa.complement_to_bio_msa()
completmentOut = str(out_file) + ".complement"
if out_file_format.value == "phylip_relaxed":
SeqIO.write(output_msa, out_file, "phylip-relaxed")
elif out_file_format.value == "phylip_sequential":
SeqIO.write(output_msa, out_file, "phylip-sequential")
SeqIO.write(output_msa, completmentOut, out_file_format.value)


def keep_trim_and_log(
alignment: MultipleSeqAlignment,
gaps: float,
mode: TrimmingMode,
use_log: bool,
out_file_name: str,
complement: bool,
gap_chars: list,
quiet: bool,
) -> tuple[MSA, MSA]:
"""
Determines positions to keep or trim and saves these positions on the MSA classes.
"""
# initialize MSA classes
keep_msa, trim_msa = create_keep_and_trim_msas(alignment, complement)

alignment_length = alignment.get_alignment_length()

site_classification_counts = dict()
site_classification_counts[SiteClassificationType.parsimony_informative] = 0
site_classification_counts[SiteClassificationType.constant] = 0
site_classification_counts[SiteClassificationType.singleton] = 0
site_classification_counts[SiteClassificationType.other] = 0

write_processing_aln()
for i in tqdm(range(alignment_length), disable=quiet, postfix="trimmer"):
sequence_at_index, gappyness = report_column_features(alignment, i, gap_chars)

character_counts = count_characters_at_position(sequence_at_index, gap_chars)

# determine if a site is parsimony informative, singleton, or constant
site_classification_type = determine_site_classification_type(character_counts)

keep_msa, trim_msa = trim(
gappyness,
site_classification_type,
site_classification_counts,
keep_msa,
trim_msa,
i,
gaps,
alignment,
mode,
use_log,
)

# inform user that output files are being written
write_output_files_message(out_file_name, complement, use_log)
return keep_msa, trim_msa, site_classification_counts
Loading