Skip to content

Commit

Permalink
Merge branch 'dev' of github.com:medema-group/BiG-SCAPE into chore/mypy
Browse files Browse the repository at this point in the history
  • Loading branch information
adraismawur committed May 8, 2024
2 parents 15a5b51 + f69af09 commit 330bd79
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 123 deletions.
2 changes: 0 additions & 2 deletions big_scape/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
validate_input_mode,
validate_binning_cluster_workflow,
validate_binning_query_workflow,
validate_skip_hmmscan,
validate_alignment_mode,
validate_includelist,
validate_gcf_cutoffs,
Expand All @@ -21,7 +20,6 @@
"validate_input_mode",
"validate_binning_cluster_workflow",
"validate_binning_query_workflow",
"validate_skip_hmmscan",
"validate_alignment_mode",
"validate_includelist",
"validate_gcf_cutoffs",
Expand Down
24 changes: 5 additions & 19 deletions big_scape/cli/cli_common_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def common_all(fn):
),
# run parameters
click.option(
"-l",
"--label",
default=None,
type=str,
Expand Down Expand Up @@ -154,6 +155,7 @@ def common_cluster_query(fn):
),
# TODO: adjust choices
click.option(
"-m",
"--mibig_version",
type=str,
required=False,
Expand All @@ -164,6 +166,7 @@ def common_cluster_query(fn):
"folder is present.",
),
click.option(
"-r",
"--reference_dir",
callback=validate_not_empty_dir,
type=click.Path(
Expand Down Expand Up @@ -224,25 +227,7 @@ def common_cluster_query(fn):
),
),
click.option(
"--force_hmmscan",
is_flag=True,
help=(
"Force domain prediction using hmmscan even if BiG-SCAPE finds "
"processed gbk files (e.g. to use a new version of Pfam)."
),
),
# TODO: can this be deleted? if database is given, hmmscan is skipped anyway?
click.option(
"--skip_hmmscan",
is_flag=True,
help=(
"Skip domain prediction using hmmscan. "
"BiG-SCAPE expects to find "
"a database of already processed gbks."
),
),
click.option(
# TODO: check if implemented
# TODO: implement
"--domain_includelist_path",
type=click.Path(
exists=True, dir_okay=False, file_okay=True, path_type=Path
Expand Down Expand Up @@ -331,6 +316,7 @@ def common_cluster_query(fn):
help="Path to output profile file. (default: output_dir/).",
),
click.option(
"-db",
"--db_path",
type=click.Path(path_type=Path, dir_okay=False),
help="Path to sqlite db output file. (default: output_dir/data_sqlite.db).",
Expand Down
17 changes: 0 additions & 17 deletions big_scape/cli/cli_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,23 +322,6 @@ def validate_binning_query_workflow(ctx) -> None:
)


def validate_skip_hmmscan(ctx) -> None:
    """Validate that a BiG-SCAPE database is given when skip_hmmscan is set.

    Skipping hmmscan requires already processed gbk files, and hence an
    existing database passed through '--db_path'.

    Args:
        ctx: click context; ``ctx.obj`` is read as a mapping holding the
            "skip_hmmscan" and "db_path" entries.

    Raises:
        click.UsageError: if skip_hmmscan is set while db_path is None.
    """
    # Nothing to validate unless hmmscan is skipped without a database.
    if not ctx.obj["skip_hmmscan"] or ctx.obj["db_path"] is not None:
        return

    # Build the message once so the log entry and the raised error cannot
    # drift apart; note the space after the first sentence, which keeps the
    # concatenated literals from running together.
    message = (
        "Missing option '--db_path'. "
        "BiG-SCAPE database has not been given, skip_hmmscan requires "
        "a DB of already processed gbk files."
    )
    logging.error(message)
    raise click.UsageError(message)


def validate_pfam_path(ctx) -> None:
"""Validates whether a BiG-SCAPE db exists when pfam_path is not provided,
which requires already processed gbk files and hence a DB in output"""
Expand Down
2 changes: 0 additions & 2 deletions big_scape/cli/cluster_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from .cli_validations import (
validate_output_paths,
validate_binning_cluster_workflow,
validate_skip_hmmscan,
validate_pfam_path,
set_start,
)
Expand Down Expand Up @@ -58,7 +57,6 @@ def cluster(ctx, *args, **kwargs):

# workflow validations
validate_binning_cluster_workflow(ctx)
validate_skip_hmmscan(ctx)
validate_pfam_path(ctx)
validate_output_paths(ctx)

Expand Down
5 changes: 2 additions & 3 deletions big_scape/cli/query_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from .cli_common_options import common_all, common_cluster_query
from .cli_validations import (
validate_output_paths,
validate_skip_hmmscan,
validate_query_bgc,
validate_pfam_path,
set_start,
Expand All @@ -27,7 +26,7 @@
@common_all
@common_cluster_query
@click.option(
"--query_bgc_path",
"-q",
"--query_bgc_path",
type=click.Path(exists=True, dir_okay=False, file_okay=True, path_type=Path),
required=True,
callback=validate_query_bgc,
Expand All @@ -38,7 +37,7 @@
),
)
@click.option(
"--query_record_number",
"-n",
"--query_record_number",
type=int,
required=False,
help=(
Expand Down
105 changes: 25 additions & 80 deletions big_scape/network/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,13 +142,9 @@ def generate_connected_components(

distance_table = DB.get_table("distance")

if seed_record is not None:
edge = get_random_edge_seeded(
cutoff, edge_param_id, seed_record, temp_record_table
)

else:
edge = get_random_edge(cutoff, edge_param_id, bin_label, temp_record_table)
edge = get_random_edge(
cutoff, edge_param_id, bin_label, temp_record_table, seed_record
)

# could be that we already generated all connected components
if edge is None:
Expand Down Expand Up @@ -335,6 +331,7 @@ def get_random_edge(
edge_param_id: int,
bin_label: str,
temp_record_table: Optional[Table] = None,
seed_record: Optional[BGCRecord] = None,
) -> Optional[tuple[int, int]]:
"""
Get a random edge from the database that is not in any connected component
Expand All @@ -355,23 +352,37 @@ def get_random_edge(
distance_table = DB.get_table("distance")
cc_table = DB.get_table("connected_component")

# this query is complicated, breaking it down:

random_edge_query = (
# select edge as just record ids
select(distance_table.c.record_a_id, distance_table.c.record_b_id)
# where record a id is not in a connected component with the same cutoff and edge param id
.where(
select(distance_table.c.record_a_id, distance_table.c.record_b_id).where(
# and where the edge has a distance less than the cutoff and the edge param id is the same
distance_table.c.distance < cutoff,
distance_table.c.edge_param_id == edge_param_id,
)
# return only one edge
.limit(1)
)

if seed_record is not None:
random_edge_query = random_edge_query.where(
or_(
distance_table.c.record_a_id == seed_record._db_id,
distance_table.c.record_b_id == seed_record._db_id,
)
)

else:
random_edge_query = random_edge_query.where(
# where record a id is not in a connected component with the same cutoff and edge param id
distance_table.c.record_a_id.notin_(
select(cc_table.c.record_id).where(
cc_table.c.cutoff == cutoff,
cc_table.c.edge_param_id == edge_param_id,
cc_table.c.bin_label == bin_label,
)
)
).where(
# and where record b id is not in a connected component with the same cutoff and edge param id
)
.where(
distance_table.c.record_b_id.notin_(
select(cc_table.c.record_id).where(
cc_table.c.cutoff == cutoff,
Expand All @@ -381,72 +392,6 @@ def get_random_edge(
),
# and where the edge has a distance less than the cutoff and the edge param id is the same
)
.where(
distance_table.c.distance < cutoff,
distance_table.c.edge_param_id == edge_param_id,
# return only one edge
)
# return only one edge
.limit(1)
)

if temp_record_table is not None:
random_edge_query = random_edge_query.where(
and_(
distance_table.c.record_a_id.in_(select(temp_record_table.c.record_id)),
distance_table.c.record_b_id.in_(select(temp_record_table.c.record_id)),
)
)

edge = cast(tuple[int, int], DB.execute(random_edge_query).fetchone())

return edge


def get_random_edge_seeded(
cutoff: float,
edge_param_id: int,
seed_record: BGCRecord,
temp_record_table: Optional[Table] = None,
) -> Optional[tuple[int, int]]:
"""
Get a random edge from the database where record a id or
record b id is the seed record
and has a distance less than the cutoff
Note that this returns only the ids to reduce the amount of data
Args:
cutoff: the distance cutoff
edge_param_id: the edge parameter id
temp_record_table (Table, optional): a temporary table with the records to include in the
connected component. Defaults to None.
Returns:
Optional[tuple[int, int]]: a tuple with the record ids of the edge or None
"""
if DB.metadata is None:
raise RuntimeError("DB.metadata is None")
distance_table = DB.get_table("distance")

random_edge_query = (
# select edge as just record ids
select(distance_table.c.record_a_id, distance_table.c.record_b_id)
.where(
# where record a id or record b id is the seed record
or_(
distance_table.c.record_a_id == seed_record._db_id,
distance_table.c.record_b_id == seed_record._db_id,
)
)
.where(
# and where the edge has a distance less than the cutoff and the edge param id is the same
distance_table.c.distance < cutoff,
distance_table.c.edge_param_id == edge_param_id,
)
# return only one edge
.limit(1)
)

if temp_record_table is not None:
random_edge_query = random_edge_query.where(
Expand Down

0 comments on commit 330bd79

Please sign in to comment.