diff --git a/big_scape/cli/cli_common_options.py b/big_scape/cli/cli_common_options.py index 0969b1cf..cf556b5c 100644 --- a/big_scape/cli/cli_common_options.py +++ b/big_scape/cli/cli_common_options.py @@ -273,21 +273,6 @@ def common_cluster_query(fn): "based on the generic 'mix' weights." ), ), - click.option( - "--legacy_classify", - is_flag=True, - help=( - "Does not use antiSMASH BGC classes to run analyses on " - "class-based bins, instead it uses BiG-SCAPE v1 predefined groups: " - "PKS1, PKSOther, NRPS, NRPS-PKS-hybrid, RiPP, Saccharide, Terpene, Others. " - "Will also use BiG-SCAPE v1 legacy_weights for distance calculations. " - "This feature is available for backwards compatibility with " - "antiSMASH versions up to v7. For higher antiSMASH versions, use " - "at your own risk, as BGC classes may have changed. All antiSMASH " - "classes that this legacy mode does not recognize will be grouped in " - "'others'." - ), - ), click.option( "--alignment_mode", type=click.Choice(["global", "glocal", "local", "auto"]), @@ -339,7 +324,7 @@ def common_cluster_query(fn): "-db", "--db_path", type=click.Path(path_type=Path, dir_okay=False), - help="Path to sqlite db output file. (default: output_dir/data_sqlite.db).", + help="Path to sqlite db output file. (default: output_dir/output_dir.db).", ), # TODO: implement cand_cluster here and LCS-ext click.option( diff --git a/big_scape/cli/cli_validations.py b/big_scape/cli/cli_validations.py index 1a4f0d92..796ce8bc 100644 --- a/big_scape/cli/cli_validations.py +++ b/big_scape/cli/cli_validations.py @@ -112,7 +112,7 @@ def validate_output_paths(ctx) -> None: timestamp = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) if "db_path" in ctx.obj and ctx.obj["db_path"] is None: - db_path = ctx.obj["output_dir"] / Path("data_sqlite.db") + db_path = ctx.obj["output_dir"] / Path(f"{ctx.obj['output_dir'].name}.db") ctx.obj["db_path"] = db_path if "log_path" in ctx.obj and ctx.obj["log_path"] is None: diff --git a/big_scape/cli/cluster_cli.py b/big_scape/cli/cluster_cli.py index 373f4bbc..16b497a9 100644 --- a/big_scape/cli/cluster_cli.py +++ b/big_scape/cli/cluster_cli.py @@ -35,6 +35,21 @@ ) # binning parameters @click.option("--no_mix", is_flag=True, help=("Don't run the all-vs-all analysis.")) +@click.option( + "--legacy_classify", + is_flag=True, + help=( + "Does not use antiSMASH BGC classes to run analyses on " + "class-based bins, instead it uses BiG-SCAPE v1 predefined groups: " + "PKS1, PKSOther, NRPS, NRPS-PKS-hybrid, RiPP, Saccharide, Terpene, Others. " + "Will also use BiG-SCAPE v1 legacy_weights for distance calculations. " + "This feature is available for backwards compatibility with " + "antiSMASH versions up to v7. For higher antiSMASH versions, use " + "at your own risk, as BGC classes may have changed. All antiSMASH " + "classes that this legacy mode does not recognize will be grouped in " + "'others'." + ), +) # networking parameters @click.option( "--include_singletons", diff --git a/big_scape/cli/query_cli.py b/big_scape/cli/query_cli.py index 71fa20d6..011d9901 100644 --- a/big_scape/cli/query_cli.py +++ b/big_scape/cli/query_cli.py @@ -49,14 +49,13 @@ ), ) @click.option( - "--skip_propagation", + "--propagate", is_flag=True, help=( - "Only generate edges between the query and reference BGCs. If not set, " - "BiG-SCAPE will also propagate edge generation to reference BGCs. " - "Warning: if the database already contains all edges, this will not work, " - "and the output will still showcase all edges between nodes " - "in the query connected component." + "By default, BiG-SCAPE will only generate edges between the query and reference" + " BGCs. With the propagate flag, BiG-SCAPE will go through multiple cycles of " + "edge generation until no new reference BGCs are connected to the query " + "connected component." ), ) @click.pass_context @@ -74,6 +73,7 @@ def query(ctx, *args, **kwarg): ctx.obj.update(ctx.params) ctx.obj["no_mix"] = None ctx.obj["hybrids_off"] = False + ctx.obj["legacy_classify"] = False ctx.obj["mode"] = "Query" # workflow validations diff --git a/big_scape/comparison/binning.py b/big_scape/comparison/binning.py index 404b2ebc..b6bde73a 100644 --- a/big_scape/comparison/binning.py +++ b/big_scape/comparison/binning.py @@ -133,7 +133,7 @@ def num_pairs(self) -> int: # find a collection of gbks with more than one subrecord member_table = ( select(func.count(record_table.c.gbk_id).label("rec_count")) - .where(record_table.c.record_type == self.record_type.value) + .where(record_table.c.id.in_(self.record_ids)) .group_by(record_table.c.gbk_id) .having(func.count() > 1) .subquery() @@ -406,8 +406,9 @@ def __init__( label: str, edge_param_id: int, weights: str, + record_type: Optional[RECORD_TYPE], ): - super().__init__(label, edge_param_id, weights) + super().__init__(label, edge_param_id, weights, record_type) self.reference_records: set[BGCRecord] = set() self.done_records: set[BGCRecord] = set() self.working_query_records: set[BGCRecord] = set() @@ -426,6 +427,9 @@ def generate_pairs( if record_a == record_b: continue + if record_a.parent_gbk == record_b.parent_gbk: + continue + if legacy_sorting: sorted_a, sorted_b = sorted((record_a, record_b), key=sort_name_key) if sorted_a._db_id is None or sorted_b._db_id is None: @@ -455,6 +459,48 @@ def num_pairs(self) -> int: num_pairs = num_query_records * num_ref_records + # delete pairs originating from the same parent gbk + if self.record_type is not None and self.record_type != RECORD_TYPE.REGION: + query_ids = [record._db_id for record in self.working_query_records] + ref_ids = [record._db_id for record in self.working_ref_records] + + if DB.metadata is None: + raise RuntimeError("DB.metadata is None") + + rec_table = DB.metadata.tables["bgc_record"] + + # contruct two tables that hold the gbk id and the number of subrecords + # present in the set of query and ref records respectively + query_gbk = ( + select( + rec_table.c.gbk_id, + func.count(rec_table.c.gbk_id).label("query_count"), + ) + .where(rec_table.c.id.in_(query_ids)) + .group_by(rec_table.c.gbk_id) + .subquery() + ) + + ref_gbk = ( + select( + rec_table.c.gbk_id, + func.count(rec_table.c.gbk_id).label("ref_count"), + ) + .where(rec_table.c.id.in_(ref_ids)) + .group_by(rec_table.c.gbk_id) + .subquery() + ) + + # now we can join the two tables and obtain the number of links between + # records from the same gbks by multiplying their counts + same_gbk_query = select( + func.sum(query_gbk.c.query_count * ref_gbk.c.ref_count) + ).join(ref_gbk, query_gbk.c.gbk_id == ref_gbk.c.gbk_id) + + same_gbks = DB.execute(same_gbk_query).scalar_one() + if same_gbks: + num_pairs -= same_gbks + return num_pairs def add_records(self, record_list: list[BGCRecord]) -> None: diff --git a/big_scape/distances/query.py b/big_scape/distances/query.py index f2aaff56..eb6c31b9 100644 --- a/big_scape/distances/query.py +++ b/big_scape/distances/query.py @@ -43,7 +43,9 @@ def calculate_distances_query( max_cutoff = max(run["gcf_cutoffs"]) edge_param_id = bs_comparison.get_edge_param_id(run, weights) - query_bin = bs_comparison.QueryRecordPairGenerator("Query", edge_param_id, weights) + query_bin = bs_comparison.QueryRecordPairGenerator( + "Query", edge_param_id, weights, run["record_type"] + ) query_bin.add_records(query_records) missing_query_bin = bs_comparison.QueryMissingRecordPairGenerator(query_bin) @@ -54,9 +56,13 @@ def calculate_distances_query( query_connected_component = next( bs_network.get_connected_components( - max_cutoff, edge_param_id, query_bin, run["run_id"] - ) + max_cutoff, edge_param_id, query_bin, run["run_id"], query_record + ), + None, ) + if query_connected_component is None: + # no nodes are connected even with the highest cutoffs in the run + return query_bin query_nodes = bs_network.get_nodes_from_cc(query_connected_component, query_records) @@ -159,47 +165,56 @@ def calculate_distances(run: dict, bin: bs_comparison.RecordPairGenerator): # fetches the current number of singleton ref <-> connected ref pairs from the database num_pairs = bin.num_pairs() - # if there are no more singleton ref <-> connected ref pairs, then break and exit - if num_pairs == 0: - break - - logging.info("Calculating distances for %d pairs", num_pairs) + if num_pairs > 0: + logging.info("Calculating distances for %d pairs", num_pairs) - save_batch = [] - num_edges = 0 + save_batch = [] + num_edges = 0 - with tqdm.tqdm(total=num_pairs, unit="edge", desc="Calculating distances") as t: + with tqdm.tqdm( + total=num_pairs, unit="edge", desc="Calculating distances" + ) as t: - def callback(edges): - nonlocal num_edges - nonlocal save_batch - batch_size = run["cores"] * 100000 - for edge in edges: - num_edges += 1 - t.update(1) - save_batch.append(edge) - if len(save_batch) > batch_size: - bs_comparison.save_edges_to_db(save_batch, commit=True) - save_batch = [] + def callback(edges): + nonlocal num_edges + nonlocal save_batch + batch_size = run["cores"] * 100000 + for edge in edges: + num_edges += 1 + t.update(1) + save_batch.append(edge) + if len(save_batch) > batch_size: + bs_comparison.save_edges_to_db(save_batch, commit=True) + save_batch = [] - bs_comparison.generate_edges( - bin, - run["alignment_mode"], - run["extend_strategy"], - run["cores"], - run["cores"] * 2, - callback, - ) + bs_comparison.generate_edges( + bin, + run["alignment_mode"], + run["extend_strategy"], + run["cores"], + run["cores"] * 2, + callback, + ) - bs_comparison.save_edges_to_db(save_batch) + bs_comparison.save_edges_to_db(save_batch) - bs_data.DB.commit() + bs_data.DB.commit() - logging.info("Generated %d edges", num_edges) + logging.info("Generated %d edges", num_edges) - if run["skip_propagation"]: + if not run["propagate"]: # in this case we only want one iteration, the Query -> Ref edges break + if isinstance(bin, bs_comparison.MissingRecordPairGenerator): + # in this case we only need edges within one cc, no cycles needed + break + if isinstance(bin, bs_comparison.QueryMissingRecordPairGenerator): + # use the num_pairs from the parent bin because in a partial database, + # all distances for the first cycle(s) might already be present. + # we still only want to stop when no other connected nodes are discovered. + if bin.bin.num_pairs() == 0: + break + bin.cycle_records(max(run["gcf_cutoffs"])) diff --git a/big_scape/file_input/load_files.py b/big_scape/file_input/load_files.py index fddf08e2..bd2b17eb 100644 --- a/big_scape/file_input/load_files.py +++ b/big_scape/file_input/load_files.py @@ -409,21 +409,16 @@ def get_all_bgc_records_query( """Get all BGC records from the working list of GBKs Args: - gbks (list[GBK]): list of GBK objects run (dict): run parameters + gbks (list[GBK]): list of GBK objects Returns: - list[bs_gbk.BGCRecord]: list of BGC records + list[bs_gbk.BGCRecord], bs_gbk.BGCRecord: list of BGC records, query BGC record """ all_bgc_records: list[bs_gbk.BGCRecord] = [] for gbk in gbks: if gbk.region is not None: - gbk_records = bs_gbk.bgc_record.get_sub_records( - gbk.region, run["record_type"] - ) if gbk.source_type == bs_enums.SOURCE_TYPE.QUERY: - query_record_type = run["record_type"] - query_record_type = run["record_type"] query_record_number = run["query_record_number"] @@ -435,15 +430,24 @@ def get_all_bgc_records_query( query_record = query_sub_records[0] else: - query_record = [ + matching_query_records = [ record for record in query_sub_records if record.number == query_record_number - ][0] + ] + if len(matching_query_records) == 0: + raise RuntimeError( + f"Could not find {query_record_type.value} number {query_record_number} in query GBK. " + "Depending on config settings, overlapping records will be merged and take on the lower number." + ) + query_record = matching_query_records[0] all_bgc_records.append(query_record) else: + gbk_records = bs_gbk.bgc_record.get_sub_records( + gbk.region, run["record_type"] + ) all_bgc_records.extend(gbk_records) return all_bgc_records, query_record diff --git a/big_scape/network/families.py b/big_scape/network/families.py index bf07f71a..e5974d08 100644 --- a/big_scape/network/families.py +++ b/big_scape/network/families.py @@ -370,28 +370,18 @@ def run_family_assignments_query( # get_connected_components returns a list of connected components, but we only # want the first one, so we use next() - try: - query_connected_component = next( - bs_network.get_connected_components( - cutoff, query_bin.edge_param_id, query_bin, run["run_id"] - ) - ) - - cc_cutoff[cutoff] = query_connected_component - - logging.debug( - "Found connected component with %d edges", - len(query_connected_component), - ) - - regions_families = generate_families( - query_connected_component, query_bin.label, cutoff, run["run_id"] - ) - - # save families to database - save_to_db(regions_families) + query_connected_component = next( + bs_network.get_connected_components( + cutoff, + query_bin.edge_param_id, + query_bin, + run["run_id"], + query_record, + ), + None, + ) - except StopIteration: + if query_connected_component is None: logging.warning( "No connected components found for %s bin at cutoff %s", query_bin.label, @@ -399,6 +389,20 @@ def run_family_assignments_query( ) continue + cc_cutoff[cutoff] = query_connected_component + + logging.debug( + "Found connected component with %d edges", + len(query_connected_component), + ) + + regions_families = generate_families( + query_connected_component, query_bin.label, cutoff, run["run_id"] + ) + + # save families to database + save_to_db(regions_families) + DB.commit() # no connected components found diff --git a/big_scape/network/network.py b/big_scape/network/network.py index d228a379..0bfad3bb 100644 --- a/big_scape/network/network.py +++ b/big_scape/network/network.py @@ -32,6 +32,7 @@ def get_connected_components( edge_param_id: int, bin: bs_comparison.RecordPairGenerator, run_id: int, + seed_record: Optional[BGCRecord] = None, ) -> Generator[list[tuple[int, int, float, float, float, float, int]], None, None]: """Generate a network for each connected component in the network If a seed record is given, the connected component will be generated starting from that record @@ -54,7 +55,7 @@ def get_connected_components( # generate connected components using dfs generate_connected_components( - cutoff, edge_param_id, bin.label, run_id, include_record_table + cutoff, edge_param_id, bin.label, run_id, include_record_table, seed_record ) cc_ids = get_connected_component_ids( diff --git a/big_scape/output/html_template/output/html_content/css/style.css b/big_scape/output/html_template/output/html_content/css/style.css index 69db78e9..72d3d7ec 100644 --- a/big_scape/output/html_template/output/html_content/css/style.css +++ b/big_scape/output/html_template/output/html_content/css/style.css @@ -24,7 +24,6 @@ html[data-theme='dark'] { --overview-button-background: hsl(var(--hue), 34%, 48%); --button-text-color: hsl(var(--hue), 10%, 10%); --background-color: hsl(var(--hue), 20%, 12%); - /* --background-header: rgba(159, 159, 143, 0.188); */ --background-header: hsl(var(--hue), 15%, 20%); --li-selected-color: hsl(var(--hue), 70%, 60%); --row-selected-color: hsl(var(--hue), 60%, 70%) @@ -160,6 +159,7 @@ a.anchor { color: var(--text-color-normal); background-color: var(--background-color); overflow: scroll; + opacity: 90%; } .desc-container.active { diff --git a/big_scape/output/html_template/output/html_content/js/bigscape.js b/big_scape/output/html_template/output/html_content/js/bigscape.js index d3f5f91c..767641fa 100755 --- a/big_scape/output/html_template/output/html_content/js/bigscape.js +++ b/big_scape/output/html_template/output/html_content/js/bigscape.js @@ -360,11 +360,9 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n var ui = Viva.Graph.svg('circle') .attr('r', 10) .attr('fill', (fam_colors[bs_to_cl[node.id]])); - if (run_data["mode"] == "Cluster") { - if (bs_data[node.id]["source"] == "mibig" || bs_data[node.id]["source"] == "reference") { - ui.attr("stroke", "blue"); - ui.attr("stroke-width", "4px"); - } + if (bs_data[node.id]["source"] == "mibig" || bs_data[node.id]["source"] == "reference") { + ui.attr("stroke", "blue"); + ui.attr("stroke-width", "4px"); } if (run_data["mode"] == "Query") { if (bs_data[node.id]["source"] == "query") { diff --git a/big_scape/output/html_template/output/index.html b/big_scape/output/html_template/output/index.html index 37694256..33d5aeeb 100644 --- a/big_scape/output/html_template/output/index.html +++ b/big_scape/output/html_template/output/index.html @@ -228,8 +228,8 @@

Network

Filter the connected components table. All filters allow for comma separated input that will be handled as logical OR operators.
- - Fuzzy search organism names, e.g. "Streptomyces" + + Fuzzy search GBK descriptions, e.g. "Streptomyces"
@@ -392,7 +392,7 @@

Network

$("#cc-table-filters").toggleClass("hidden") }) $("#clear-filters").on("click", (event) => { - $("#organism-filter").val("") + $("#desc-filter").val("") $("#bgc-filter").val("") $("#domain-filter").val("") $("#family-filter").val("") @@ -558,7 +558,7 @@

Network

run_params["cutoff"] = cutoff var bin_label = $("#network-selection option:selected").text() - var org_filter = $("#organism-filter").val().toString().trim() + var org_filter = $("#desc-filter").val().toString().trim() var pf_filter = $("#domain-filter").val().toString().trim() var fam_filter = $("#family-filter").val().toString().trim() var bgc_filter = $("#bgc-filter").val().toString().trim() @@ -573,7 +573,7 @@

Network

AND connected_component.bin_label=="${bin_label}"` if (org_filter) { - var conditions = org_filter.split(",").map(org => `gbk.organism LIKE "%${org.trim()}%"`) + var conditions = org_filter.split(",").map(org => `gbk.description LIKE "%${org.trim()}%"`) cc_query += ` AND (${conditions.join(" OR ")})` } if (pf_filter) { @@ -647,7 +647,7 @@

Network

} var tsv_text = `# Collection of selected connected components # Run info: ${run_data["label"]} ${run_data["cutoff"]} ${$("#network-selection option:selected").text()} -# Organism filter: ${$("#organism-filter").val()} +# Description filter: ${$("#desc-filter").val()} # BGC filter: ${$("#bgc-filter").val()} # Pfam filter: ${$("#domain-filter").val()} # Family filter: ${$("#family-filter").val()}\n @@ -672,7 +672,7 @@

Network

if (!dom_data.hasOwnProperty(cds_id)) { dom_data[cds_id] = [] } - dom_data[cds_id].push({ "code": domains[i][1], "start": domains[i][2], "end": domains[i][3], "bitscore": domains[i][4] }) + dom_data[cds_id].push({ "code": domains[i][1], "start": domains[i][2], "end": domains[i][3], "bitscore": domains[i][4].toFixed(3) }) } return dom_data } @@ -693,7 +693,7 @@

Network

return cds_data } function generate_bs_data(record_ids, run_data) { - var records_query = `SELECT gbk.id, gbk.organism, length(gbk.nt_seq), gbk.hash, gbk.path, + var records_query = `SELECT gbk.id, gbk.description, length(gbk.nt_seq), gbk.hash, gbk.path, bgc_record.record_type, bgc_record.nt_start, bgc_record.nt_stop, bgc_record.record_number, bgc_record.id FROM bgc_record INNER JOIN gbk ON bgc_record.gbk_id==gbk.id @@ -733,17 +733,17 @@

Network

} new_row["record_start"] = rec_start new_row["record_stop"] = rec_stop + + if (gbk_path.indexOf(run_data["reference_dir"]) > -1) { + new_row["source"] = "reference" + } + if (gbk_path.indexOf(`mibig_antismash_${run_data["mibig_version"]}_gbk`) > -1) { + new_row["source"] = "mibig" + } if (run_data["mode"] == "Query") { if (filename == run_data["query_path"]) { new_row["source"] = "query" } - } else { - if (gbk_path.indexOf(run_data["reference_dir"]) > -1) { - new_row["source"] = "reference" - } - if (gbk_path.indexOf(run_data["mibig_version"]) > -1) { - new_row["source"] = "mibig" - } } bs_data[record_id] = new_row } diff --git a/test/comparison/test_binning.py b/test/comparison/test_binning.py index 730f8e31..ec62e475 100644 --- a/test/comparison/test_binning.py +++ b/test/comparison/test_binning.py @@ -161,7 +161,7 @@ def test_num_pairs_correct_with_query_ref(self): for bgc in bgc_list: bgc.save_record("Region") - new_bin = QueryRecordPairGenerator("test", 1, "mix") + new_bin = QueryRecordPairGenerator("test", 1, "mix", None) new_bin.add_records(bgc_list) @@ -290,7 +290,7 @@ def test_query_to_ref_pair_generator(self): create_mock_gbk(i, bs_enums.SOURCE_TYPE.REFERENCE) for i in range(1, 4) ] - query_to_ref_pair_generator = QueryRecordPairGenerator("mix", 1, "mix") + query_to_ref_pair_generator = QueryRecordPairGenerator("mix", 1, "mix", None) source_records = [query_gbk.region] for ref_gbk in ref_gbks: diff --git a/test/integration/test_comparison.py b/test/integration/test_comparison.py index c81c5e74..7ea0b6c1 100644 --- a/test/integration/test_comparison.py +++ b/test/integration/test_comparison.py @@ -1020,7 +1020,9 @@ def test_missing_query_pair_generator_first_iteration(self): create_mock_gbk(i, bs_enums.SOURCE_TYPE.REFERENCE) for i in range(1, 5) ] - query_pair_generator = bs_comparison.QueryRecordPairGenerator("mix", 1, "mix") + query_pair_generator = bs_comparison.QueryRecordPairGenerator( + "mix", 1, "mix", None + ) source_records = [query_gbk.region] for ref_gbk in ref_gbks: source_records.append(ref_gbk.region) @@ -1147,7 +1149,9 @@ def test_query_generators_workflow(self): create_mock_gbk(i, bs_enums.SOURCE_TYPE.REFERENCE) for i in range(1, 5) ] - query_pair_generator = bs_comparison.QueryRecordPairGenerator("mix", 1, "mix") + query_pair_generator = bs_comparison.QueryRecordPairGenerator( + "mix", 1, "mix", None + ) source_records = [query_gbk.region] for ref_gbk in ref_gbks: source_records.append(ref_gbk.region) @@ -1467,7 +1471,7 @@ def test_generate_bins_query_workflow(self): "record_type": bs_enums.RECORD_TYPE.REGION, "cores": 1, "classify": bs_enums.CLASSIFY_MODE.CLASS, - "skip_propagation": False, + "propagate": True, "gcf_cutoffs": [0.1, 0.7], } @@ -1488,7 +1492,7 @@ def test_generate_bins_query_workflow(self): # generate initial query -> ref pairs query_bin = bs_comparison.QueryRecordPairGenerator( - "Query", edge_param_id, weights + "Query", edge_param_id, weights, run["record_type"] ) query_bin.add_records(query_records) @@ -1603,14 +1607,16 @@ def test_calculate_distances_query(self): "record_type": bs_enums.RECORD_TYPE.REGION, "cores": 1, "classify": bs_enums.CLASSIFY_MODE.CLASS, - "skip_propagation": False, + "propagate": True, "run_id": 1, "gcf_cutoffs": [0.1, 0.8], } query_record, list_bgc_records = create_mock_query_dataset(run) - query_to_ref_bin = bs_comparison.QueryRecordPairGenerator("Query_Ref", 1, "mix") + query_to_ref_bin = bs_comparison.QueryRecordPairGenerator( + "Query_Ref", 1, "mix", run["record_type"] + ) query_records = bs_query.get_query_records(run, list_bgc_records, query_record) query_to_ref_bin.add_records(query_records)