Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/domain includelist #168

Merged
merged 6 commits into from
Jun 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions big_scape/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
validate_binning_cluster_workflow,
validate_binning_query_workflow,
validate_alignment_mode,
validate_includelist,
validate_includelist_all,
validate_includelist_any,
validate_gcf_cutoffs,
validate_filter_gbk,
validate_pfam_path,
validate_domain_include_list,
validate_classify,
validate_output_dir,
validate_query_record,
Expand All @@ -21,10 +23,12 @@
"validate_binning_cluster_workflow",
"validate_binning_query_workflow",
"validate_alignment_mode",
"validate_includelist",
"validate_includelist_all",
"validate_includelist_any",
"validate_gcf_cutoffs",
"validate_filter_gbk",
"validate_pfam_path",
"validate_domain_include_list",
"validate_classify",
"validate_output_dir",
"validate_query_record",
Expand Down
24 changes: 20 additions & 4 deletions big_scape/cli/cli_common_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
validate_not_empty_dir,
validate_input_mode,
validate_alignment_mode,
validate_includelist,
validate_includelist_all,
validate_includelist_any,
validate_gcf_cutoffs,
validate_filter_gbk,
validate_record_type,
Expand Down Expand Up @@ -218,13 +219,28 @@ def common_cluster_query(fn):
),
click.option(
# TODO: implement
"--domain_includelist_path",
"--domain_includelist_all_path",
type=click.Path(
exists=True, dir_okay=False, file_okay=True, path_type=Path
),
callback=validate_includelist,
callback=validate_includelist_all,
help=(
"Path to txt file with Pfam accessions. Only BGCs containing "
"Path to txt file with Pfam accessions. Only BGCs containing all "
"the listed accessions will be analysed. In this file, each "
"line contains a single Pfam accession (with an optional comment,"
" separated by a tab). Lines starting with '#' are ignored. Pfam "
"accessions are case-sensitive."
),
),
click.option(
# TODO: implement
"--domain_includelist_any_path",
type=click.Path(
exists=True, dir_okay=False, file_okay=True, path_type=Path
),
callback=validate_includelist_any,
help=(
"Path to txt file with Pfam accessions. Only BGCs containing any of "
"the listed accessions will be analysed. In this file, each "
"line contains a single Pfam accession (with an optional comment,"
" separated by a tab). Lines starting with '#' are ignored. Pfam "
Expand Down
64 changes: 59 additions & 5 deletions big_scape/cli/cli_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def validate_filter_gbk(ctx, param, filter_str) -> list[str]:
# hmmer parameters


def validate_includelist(ctx, param, domain_includelist_path) -> None:
def validate_includelist(ctx, param, domain_includelist_path):
"""Validate the path to the domain include list and return a list of domain
accession strings contained within this file

Expand All @@ -207,17 +207,24 @@ def validate_includelist(ctx, param, domain_includelist_path) -> None:

if not domain_includelist_path.exists():
logging.error("domain_includelist file does not exist!")
raise InvalidArgumentError("--domain_includelist", domain_includelist_path)
raise InvalidArgumentError(
"--domain_includelist_all/any_path", domain_includelist_path
)

with domain_includelist_path.open(encoding="utf-8") as domain_includelist_file:
lines = domain_includelist_file.readlines()

lines = [line.strip() for line in lines]
pfams = []

for line in lines:
line = line.strip()
elemts = line.split("\t")
pfams.append(elemts[0])

# expect Pfam accessions, i.e. PF00001 or PF00001.10
lines_valid = map(
lambda string: string.startswith("PF") and len(string) in range(7, 11),
lines,
pfams,
)

if not all(lines_valid):
Expand All @@ -228,7 +235,37 @@ def validate_includelist(ctx, param, domain_includelist_path) -> None:
"Invalid Pfam accession(s) found in file %s", domain_includelist_path
)

ctx.params["domain_includelist"] = lines
return pfams


def validate_includelist_all(ctx, param, domain_includelist_all_path) -> None:
"""Validate the path to the domain include list and return a list of domain
accession strings contained within this file

Returns:
list[str]: A list of domain accessions to include
"""

pfams = validate_includelist(ctx, param, domain_includelist_all_path)

ctx.params["domain_includelist_all"] = pfams

return None


def validate_includelist_any(ctx, param, domain_includelist_any_path) -> None:
"""Validate the path to the domain include list and return a list of domain
accession strings contained within this file

Returns:
list[str]: A list of domain accessions to include
"""

pfams = validate_includelist(ctx, param, domain_includelist_any_path)

ctx.params["domain_includelist_any"] = pfams

return None


# workflow validations
Expand Down Expand Up @@ -334,6 +371,23 @@ def validate_pfam_path(ctx) -> None:
)


def validate_domain_include_list(ctx) -> None:
"""Raise an error if both domain includelists are given at the same time"""

if (
ctx.obj["domain_includelist_all_path"]
and ctx.obj["domain_includelist_any_path"]
):
logging.error(
"You have selected both all and any domain_includelist options. "
"Please select only one of the two at a time."
)
raise click.UsageError(
"You have selected both all and any domain_includelist options. "
"Please select only one of the two at a time."
)


def validate_record_type(ctx, _, record_type) -> Optional[bs_enums.genbank.RECORD_TYPE]:
"""Validates whether a region_type is provided when running classify"""
valid_types = {mode.value: mode for mode in bs_enums.genbank.RECORD_TYPE}
Expand Down
2 changes: 2 additions & 0 deletions big_scape/cli/cluster_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
validate_output_paths,
validate_binning_cluster_workflow,
validate_pfam_path,
validate_domain_include_list,
set_start,
)

Expand Down Expand Up @@ -58,6 +59,7 @@ def cluster(ctx, *args, **kwargs):
# workflow validations
validate_binning_cluster_workflow(ctx)
validate_pfam_path(ctx)
validate_domain_include_list(ctx)
validate_output_paths(ctx)

# set start time and run label
Expand Down
12 changes: 7 additions & 5 deletions big_scape/comparison/comparable_region.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ def inflate_a(self, record_a: BGCRecord) -> None:
comparable region has already been inflated, so this method should only be
called once.
"""
a_cds_list = record_a.get_cds()
a_cds_with_domains = record_a.get_cds_with_domains()
a_cds_list = list(record_a.get_cds())
a_cds_with_domains = list(record_a.get_cds_with_domains())
lcs_a_start_orf_num = a_cds_with_domains[self.lcs_a_start].orf_num
lcs_a_stop_orf_num = a_cds_with_domains[self.lcs_a_stop - 1].orf_num
a_start_orf_num = a_cds_with_domains[self.a_start].orf_num
Expand Down Expand Up @@ -121,8 +121,8 @@ def inflate_b(self, record_b: BGCRecord) -> None:
comparable region has already been inflated, so this method should only be
called once.
"""
b_cds_list = record_b.get_cds()
b_cds_with_domains = record_b.get_cds_with_domains()
b_cds_list = list(record_b.get_cds())
b_cds_with_domains = list(record_b.get_cds_with_domains())

if self.reverse:
b_cds_list = b_cds_list[::-1]
Expand Down Expand Up @@ -246,7 +246,9 @@ def cds_range_contains_biosynthetic(
if end_inclusive:
stop += 1

for cds in record.get_cds_with_domains(reverse=reverse)[cds_start:stop]:
cds_list = list(record.get_cds_with_domains(reverse=reverse))

for cds in cds_list[cds_start:stop]:
if cds.gene_kind is None:
continue

Expand Down
12 changes: 6 additions & 6 deletions big_scape/comparison/extend.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ def reset(pair: RecordPair) -> None:

pair.comparable_region.a_start = 0
pair.comparable_region.b_start = 0
pair.comparable_region.a_stop = len(pair.record_a.get_cds_with_domains())
pair.comparable_region.b_stop = len(pair.record_b.get_cds_with_domains())
pair.comparable_region.a_stop = len(list(pair.record_a.get_cds_with_domains()))
pair.comparable_region.b_stop = len(list(pair.record_b.get_cds_with_domains()))
pair.comparable_region.domain_a_start = 0
pair.comparable_region.domain_b_start = 0
pair.comparable_region.domain_a_stop = len(pair.record_a.get_hsps())
pair.comparable_region.domain_b_stop = len(pair.record_b.get_hsps())
pair.comparable_region.domain_a_stop = len(list(pair.record_a.get_hsps()))
pair.comparable_region.domain_b_stop = len(list(pair.record_b.get_hsps()))

pair.comparable_region.reverse = False

Expand Down Expand Up @@ -108,8 +108,8 @@ def extend(
# get the cds lists
# TODO: base extend on all domains in case of protoclusters, allow extend beyond
# protocluster border
a_domains = pair.record_a.get_hsps()
b_domains = pair.record_b.get_hsps()
a_domains = list(pair.record_a.get_hsps())
b_domains = list(pair.record_b.get_hsps())

a_max_dist = math.floor(len(a_domains) * max_match_dist_perc)
b_max_dist = math.floor(len(b_domains) * max_match_dist_perc)
Expand Down
8 changes: 4 additions & 4 deletions big_scape/comparison/lcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,8 @@ def find_domain_lcs_region(
logging.debug("region lcs")

# these are regions, so we can get the full range of CDS
a_cds = pair.record_a.get_cds_with_domains()
b_cds = pair.record_b.get_cds_with_domains()
a_cds = list(pair.record_a.get_cds_with_domains())
b_cds = list(pair.record_b.get_cds_with_domains())

# working on domains, not cds
a_domains: list[bs_hmm.HSP] = []
Expand Down Expand Up @@ -431,8 +431,8 @@ def find_domain_lcs_protocluster(
if not isinstance(pair.record_b, bs_genbank.ProtoCluster):
raise TypeError("record_b must be a protocluster")

a_cds = pair.record_a.get_cds_with_domains()
b_cds = pair.record_b.get_cds_with_domains()
a_cds = list(pair.record_a.get_cds_with_domains())
b_cds = list(pair.record_b.get_cds_with_domains())

# working on domains, not cds
a_domains: list[bs_hmm.HSP] = []
Expand Down
23 changes: 12 additions & 11 deletions big_scape/comparison/record_pair.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ def __init__(self, record_a: BGCRecord, record_b: BGCRecord):
raise ValueError("Region in pair has no parent GBK!")

# comparable regions start "deflated", meaning only CDS with domains
a_len = len(record_a.get_cds_with_domains())
b_len = len(record_b.get_cds_with_domains())
a_domain_len = len(record_a.get_hsps())
b_domain_len = len(record_b.get_hsps())
a_len = len(list(record_a.get_cds_with_domains()))
b_len = len(list(record_b.get_cds_with_domains()))
a_domain_len = len(list(record_a.get_hsps()))
b_domain_len = len(list(record_b.get_hsps()))

self.comparable_region: ComparableRegion = ComparableRegion(
0, a_len, 0, b_len, 0, a_domain_len, 0, b_domain_len, False
Expand Down Expand Up @@ -128,10 +128,11 @@ def get_domain_lists(

reverse = self.comparable_region.reverse

a_cds_list = self.record_a.get_cds_with_domains()[a_start:a_stop]
b_cds_list = self.record_b.get_cds_with_domains(reverse=reverse)[
b_start:b_stop
]
cds_list_a = list(self.record_a.get_cds_with_domains())
a_cds_list = cds_list_a[a_start:a_stop]

cds_list_b = list(self.record_b.get_cds_with_domains(reverse=reverse))
b_cds_list = cds_list_b[b_start:b_stop]

a_domain_list: list[HSP] = []
for a_cds in a_cds_list:
Expand Down Expand Up @@ -199,9 +200,9 @@ def log_comparable_region(self, label="<") -> None: # pragma: no cover
if logging.getLogger().level > logging.DEBUG:
return

a_cds_list = self.record_a.get_cds_with_domains()
b_cds_list = self.record_b.get_cds_with_domains(
reverse=self.comparable_region.reverse
a_cds_list = list(self.record_a.get_cds_with_domains())
b_cds_list = list(
self.record_b.get_cds_with_domains(reverse=self.comparable_region.reverse)
)

b_start = self.comparable_region.b_start
Expand Down
Loading
Loading