Merge branch 'dev' of github.com:medema-group/BiG-SCAPE into chore/mypy

medema-group · May 8, 2024 · 330bd79 · 330bd79
2 parents 15a5b51 + f69af09
commit 330bd79
Show file tree

Hide file tree

Showing 6 changed files with 32 additions and 123 deletions.
diff --git a/big_scape/cli/__init__.py b/big_scape/cli/__init__.py
@@ -5,7 +5,6 @@
     validate_input_mode,
     validate_binning_cluster_workflow,
     validate_binning_query_workflow,
-    validate_skip_hmmscan,
     validate_alignment_mode,
     validate_includelist,
     validate_gcf_cutoffs,
@@ -21,7 +20,6 @@
     "validate_input_mode",
     "validate_binning_cluster_workflow",
     "validate_binning_query_workflow",
-    "validate_skip_hmmscan",
     "validate_alignment_mode",
     "validate_includelist",
     "validate_gcf_cutoffs",

diff --git a/big_scape/cli/cli_common_options.py b/big_scape/cli/cli_common_options.py
@@ -56,6 +56,7 @@ def common_all(fn):
         ),
         # run parameters
         click.option(
+            "-l",
             "--label",
             default=None,
             type=str,
@@ -154,6 +155,7 @@ def common_cluster_query(fn):
         ),
         # TODO: adjust choices
         click.option(
+            "-m",
             "--mibig_version",
             type=str,
             required=False,
@@ -164,6 +166,7 @@ def common_cluster_query(fn):
             "folder is present.",
         ),
         click.option(
+            "-r",
             "--reference_dir",
             callback=validate_not_empty_dir,
             type=click.Path(
@@ -224,25 +227,7 @@ def common_cluster_query(fn):
             ),
         ),
         click.option(
-            "--force_hmmscan",
-            is_flag=True,
-            help=(
-                "Force domain prediction using hmmscan even if BiG-SCAPE finds "
-                "processed gbk files (e.g. to use a new version of Pfam)."
-            ),
-        ),
-        # TODO: can this be deleted? if database is given, hmmscan is skipped anyway?
-        click.option(
-            "--skip_hmmscan",
-            is_flag=True,
-            help=(
-                "Skip domain prediction using hmmscan. "
-                "BiG-SCAPE expects to find "
-                "a database of already processed gbks."
-            ),
-        ),
-        click.option(
-            # TODO: check if implemented
+            # TODO: implement
             "--domain_includelist_path",
             type=click.Path(
                 exists=True, dir_okay=False, file_okay=True, path_type=Path
@@ -331,6 +316,7 @@ def common_cluster_query(fn):
             help="Path to output profile file. (default: output_dir/).",
         ),
         click.option(
+            "-db",
             "--db_path",
             type=click.Path(path_type=Path, dir_okay=False),
             help="Path to sqlite db output file. (default: output_dir/data_sqlite.db).",

diff --git a/big_scape/cli/cli_validations.py b/big_scape/cli/cli_validations.py
@@ -322,23 +322,6 @@ def validate_binning_query_workflow(ctx) -> None:
         )
 
 
-def validate_skip_hmmscan(ctx) -> None:
-    """Validates whether a BiG-SCAPE db exists when running skip_hmm, which
-    requires already processed gbk files and hence a DB in output"""
-
-    if ctx.obj["skip_hmmscan"] and ctx.obj["db_path"] is None:
-        logging.error(
-            "Missing option '--db_path'."
-            "BiG-SCAPE database has not been given, skip_hmmscan requires "
-            "a DB of already processed gbk files."
-        )
-        raise click.UsageError(
-            "Missing option '--db_path'."
-            "BiG-SCAPE database has not been given, skip_hmmscan requires "
-            "a DB of already processed gbk files."
-        )
-
-
 def validate_pfam_path(ctx) -> None:
     """Validates whether a BiG-SCAPE db exists when pfam_path is not provided,
     which requires already processed gbk files and hence a DB in output"""

diff --git a/big_scape/cli/cluster_cli.py b/big_scape/cli/cluster_cli.py
@@ -12,7 +12,6 @@
 from .cli_validations import (
     validate_output_paths,
     validate_binning_cluster_workflow,
-    validate_skip_hmmscan,
     validate_pfam_path,
     set_start,
 )
@@ -58,7 +57,6 @@ def cluster(ctx, *args, **kwargs):
 
     # workflow validations
     validate_binning_cluster_workflow(ctx)
-    validate_skip_hmmscan(ctx)
     validate_pfam_path(ctx)
     validate_output_paths(ctx)
 

diff --git a/big_scape/cli/query_cli.py b/big_scape/cli/query_cli.py
@@ -13,7 +13,6 @@
 from .cli_common_options import common_all, common_cluster_query
 from .cli_validations import (
     validate_output_paths,
-    validate_skip_hmmscan,
     validate_query_bgc,
     validate_pfam_path,
     set_start,
@@ -27,7 +26,7 @@
 @common_all
 @common_cluster_query
 @click.option(
-    "--query_bgc_path",
+    "-q", "--query_bgc_path",
     type=click.Path(exists=True, dir_okay=False, file_okay=True, path_type=Path),
     required=True,
     callback=validate_query_bgc,
@@ -38,7 +37,7 @@
     ),
 )
 @click.option(
-    "--query_record_number",
+    "-n", "--query_record_number",
     type=int,
     required=False,
     help=(

diff --git a/big_scape/network/network.py b/big_scape/network/network.py
@@ -142,13 +142,9 @@ def generate_connected_components(
 
     distance_table = DB.get_table("distance")
 
-    if seed_record is not None:
-        edge = get_random_edge_seeded(
-            cutoff, edge_param_id, seed_record, temp_record_table
-        )
-
-    else:
-        edge = get_random_edge(cutoff, edge_param_id, bin_label, temp_record_table)
+    edge = get_random_edge(
+        cutoff, edge_param_id, bin_label, temp_record_table, seed_record
+    )
 
     # could be that we already generated all connected components
     if edge is None:
@@ -335,6 +331,7 @@ def get_random_edge(
     edge_param_id: int,
     bin_label: str,
     temp_record_table: Optional[Table] = None,
+    seed_record: Optional[BGCRecord] = None,
 ) -> Optional[tuple[int, int]]:
     """
     Get a random edge from the database that is not in any connected component
@@ -355,23 +352,37 @@ def get_random_edge(
     distance_table = DB.get_table("distance")
     cc_table = DB.get_table("connected_component")
 
-    # this query is complicated, breaking it down:
-
     random_edge_query = (
         # select edge as just record ids
-        select(distance_table.c.record_a_id, distance_table.c.record_b_id)
-        # where record a id is not in a connected component with the same cutoff and edge param id
-        .where(
+        select(distance_table.c.record_a_id, distance_table.c.record_b_id).where(
+            # where the edge has a distance less than the cutoff and the edge param id is the same
+            distance_table.c.distance < cutoff,
+            distance_table.c.edge_param_id == edge_param_id,
+        )
+        # return only one edge
+        .limit(1)
+    )
+
+    if seed_record is not None:
+        random_edge_query = random_edge_query.where(
+            or_(
+                distance_table.c.record_a_id == seed_record._db_id,
+                distance_table.c.record_b_id == seed_record._db_id,
+            )
+        )
+
+    else:
+        random_edge_query = random_edge_query.where(
+            # where record a id is not in a connected component with the same cutoff, edge param id and bin label
             distance_table.c.record_a_id.notin_(
                 select(cc_table.c.record_id).where(
                     cc_table.c.cutoff == cutoff,
                     cc_table.c.edge_param_id == edge_param_id,
                     cc_table.c.bin_label == bin_label,
                 )
             )
+        ).where(
             # and where record b id is not in a connected component with the same cutoff and edge param id
-        )
-        .where(
             distance_table.c.record_b_id.notin_(
                 select(cc_table.c.record_id).where(
                     cc_table.c.cutoff == cutoff,
@@ -381,72 +392,6 @@ def get_random_edge(
             ),
             # and where the edge has a distance less than the cutoff and the edge param id is the same
         )
-        .where(
-            distance_table.c.distance < cutoff,
-            distance_table.c.edge_param_id == edge_param_id,
-            # return only one edge
-        )
-        # return only one edge
-        .limit(1)
-    )
-
-    if temp_record_table is not None:
-        random_edge_query = random_edge_query.where(
-            and_(
-                distance_table.c.record_a_id.in_(select(temp_record_table.c.record_id)),
-                distance_table.c.record_b_id.in_(select(temp_record_table.c.record_id)),
-            )
-        )
-
-    edge = cast(tuple[int, int], DB.execute(random_edge_query).fetchone())
-
-    return edge
-
-
-def get_random_edge_seeded(
-    cutoff: float,
-    edge_param_id: int,
-    seed_record: BGCRecord,
-    temp_record_table: Optional[Table] = None,
-) -> Optional[tuple[int, int]]:
-    """
-    Get a random edge from the database where record a id or
-    record b id is the seed record
-    and has a distance less than the cutoff
-
-    Note that this returns only the ids to reduce the amount of data
-
-    Args:
-        cutoff: the distance cutoff
-        edge_param_id: the edge parameter id
-        temp_record_table (Table, optional): a temporary table with the records to include in the
-        connected component. Defaults to None.
-
-    Returns:
-        Optional[tuple[int, int]]: a tuple with the record ids of the edge or None
-    """
-    if DB.metadata is None:
-        raise RuntimeError("DB.metadata is None")
-    distance_table = DB.get_table("distance")
-
-    random_edge_query = (
-        # select edge as just record ids
-        select(distance_table.c.record_a_id, distance_table.c.record_b_id)
-        .where(
-            # where record a id or record b id is the seed record
-            or_(
-                distance_table.c.record_a_id == seed_record._db_id,
-                distance_table.c.record_b_id == seed_record._db_id,
-            )
-        )
-        .where(
-            # and where the edge has a distance less than the cutoff and the edge param id is the same
-            distance_table.c.distance < cutoff,
-            distance_table.c.edge_param_id == edge_param_id,
-        )
-        # return only one edge
-        .limit(1)
-    )
 
     if temp_record_table is not None:
         random_edge_query = random_edge_query.where(