From e2c6122ba2985af0b31e1f49fc59602c39814e5e Mon Sep 17 00:00:00 2001 From: "Arjan Draisma (wur)" <74908173+adraismawur@users.noreply.github.com> Date: Tue, 14 Nov 2023 13:58:38 +0100 Subject: [PATCH] Revert "Feature/input subset" --- big_scape/data/partial_task.py | 36 ++++---- big_scape/data/schema.sql | 3 +- big_scape/enums/partial_task.py | 2 +- big_scape/file_input/load_files.py | 30 +++---- big_scape/genbank/candidate_cluster.py | 1 - big_scape/genbank/cds.py | 1 - big_scape/genbank/gbk.py | 77 ++++++++++++------ big_scape/genbank/proto_cluster.py | 1 - big_scape/genbank/proto_core.py | 1 - big_scape/genbank/region.py | 1 - .../output/html_content/js/bigscape.js | 4 +- big_scape/output/legacy_output.py | 6 +- test/comparison/test_binning.py | 30 +++---- test/comparison/test_comparable_region.py | 16 ++-- test/comparison/test_lcs.py | 8 +- test/comparison/test_scores.py | 28 +++---- test/db/test_partial.py | 6 +- test/genbank/test_bgc_record.py | 8 +- test/genbank/test_gbk.py | 32 +------- test/hmm/test_hmm_align.py | 2 +- test/hmm/test_hmm_scan.py | 2 +- test/network/test_network.py | 4 +- test/test_data/database/valid_family.db | Bin 417792 -> 417792 bytes test/test_data/database/valid_populated.db | Bin 4325376 -> 4325376 bytes test/trees/test_alignment.py | 24 +++--- 25 files changed, 150 insertions(+), 173 deletions(-) diff --git a/big_scape/data/partial_task.py b/big_scape/data/partial_task.py index 2a010727..a8942b9c 100644 --- a/big_scape/data/partial_task.py +++ b/big_scape/data/partial_task.py @@ -29,14 +29,8 @@ def find_minimum_task(gbks: list[GBK]): """ input_data_state = get_input_data_state(gbks) - # new data or mixed data - if ( - input_data_state.value == bs_enums.INPUT_TASK.NEW_DATA.value - or input_data_state.value == bs_enums.INPUT_TASK.MIXED_DATA.value - or input_data_state.value == bs_enums.INPUT_TASK.NO_DATA.value - ): - # gbks from input need to be loaded into the in-memory database - return bs_enums.TASK.SAVE_GBKS + if input_data_state.value < bs_enums.INPUT_TASK.SAME_DATA.value: + return bs_enums.TASK.LOAD_GBKS hmm_data_state = get_hmm_data_state(gbks) @@ -68,20 +62,20 @@ def get_input_data_state(gbks: list[GBK]) -> bs_enums.INPUT_TASK: # get set of gbks in database db_gbk_rows = DB.execute(gbk_table.select()).all() - db_gbk_hashes: set[str] = {db_gbk_row[2] for db_gbk_row in db_gbk_rows} - input_gbk_hashes: set[str] = {str(gbk.hash) for gbk in gbks} + db_gbk_paths: set[str] = {db_gbk_row[1] for db_gbk_row in db_gbk_rows} + input_gbk_paths: set[str] = {str(gbk.path) for gbk in gbks} - if db_gbk_hashes == input_gbk_hashes: + if db_gbk_paths == input_gbk_paths: return bs_enums.INPUT_TASK.SAME_DATA - union = db_gbk_hashes & input_gbk_hashes + sym_dif = db_gbk_paths.symmetric_difference(input_gbk_paths) - # all new data - if len(union) == 0: + # still same amount in db. new data + if len(sym_dif) == len(db_gbk_paths): return bs_enums.INPUT_TASK.NEW_DATA - # only partial data which is already in database - if len(union) == len(input_gbk_hashes): + # same amount in new data. there was more in db than in new data + if len(sym_dif) == len(input_gbk_paths): return bs_enums.INPUT_TASK.PARTIAL_DATA # otherwise there is some new data, some old data is missing @@ -98,7 +92,7 @@ def get_missing_gbks(gbks: list[GBK]) -> list[GBK]: list[GBK]: List of GBKs that are missing from the database """ # dictionary of gbk path to gbk object - gbk_dict = {str(gbk.hash): gbk for gbk in gbks} + gbk_dict = {str(gbk.path): gbk for gbk in gbks} if not DB.metadata: raise RuntimeError("DB.metadata is None") @@ -107,13 +101,13 @@ def get_missing_gbks(gbks: list[GBK]) -> list[GBK]: # get set of gbks in database db_gbk_rows = DB.execute(gbk_table.select()).all() - db_gbk_hashes: set[int] = {db_gbk_row[2] for db_gbk_row in db_gbk_rows} + db_gbk_paths: set[int] = {db_gbk_row[1] for db_gbk_row in db_gbk_rows} missing_gbks = [] - for gbk_hash in gbk_dict: - if gbk_hash not in db_gbk_hashes: - missing_gbks.append(gbk_dict[gbk_hash]) + for gbk_path in gbk_dict: + if gbk_path not in db_gbk_paths: + missing_gbks.append(gbk_dict[gbk_path]) return missing_gbks diff --git a/big_scape/data/schema.sql b/big_scape/data/schema.sql index 62325ba9..b04bebaf 100644 --- a/big_scape/data/schema.sql +++ b/big_scape/data/schema.sql @@ -5,12 +5,11 @@ CREATE TABLE IF NOT EXISTS gbk ( id INTEGER PRIMARY KEY AUTOINCREMENT, path TEXT, - hash TEXT, nt_seq TEXT, organism TEXT, taxonomy TEXT, description TEXT, - UNIQUE(hash) + UNIQUE(path) ); CREATE TABLE IF NOT EXISTS bgc_record ( diff --git a/big_scape/enums/partial_task.py b/big_scape/enums/partial_task.py index b2f14259..8df3d788 100644 --- a/big_scape/enums/partial_task.py +++ b/big_scape/enums/partial_task.py @@ -4,7 +4,7 @@ class TASK(Enum): - SAVE_GBKS = 0 + LOAD_GBKS = 0 HMM_SCAN = 1 HMM_ALIGN = 2 COMPARISON = 3 diff --git a/big_scape/file_input/load_files.py b/big_scape/file_input/load_files.py index 803ff481..33f31ec6 100644 --- a/big_scape/file_input/load_files.py +++ b/big_scape/file_input/load_files.py @@ -290,20 +290,20 @@ def load_gbks(run: dict, bigscape_dir: Path) -> list[GBK]: bs_data.DB.load_from_disk(run["db_path"]) task_state = bs_data.find_minimum_task(input_gbks) - source_dict = {gbk.hash: gbk.source_type for gbk in input_gbks} - - # if we are are not on the save_gbks task, we have all the data we need in the database - # and can just load it all into the correct python objects - if task_state != bs_enums.TASK.SAVE_GBKS: - # here we dont save anything to DB, data goes DB -> python objects + # if we are are not on the load_gbks task, we have all the data we need + if task_state != bs_enums.TASK.LOAD_GBKS: logging.info("Loading existing run from disk...") - input_gbks_from_db = GBK.load_many(input_gbks) - for gbk in input_gbks_from_db: - gbk.source_type = source_dict[gbk.hash] + source_dict = {gbk.path: gbk.source_type for gbk in input_gbks} + + gbks_from_db = GBK.load_all() + for gbk in gbks_from_db: + gbk.source_type = source_dict[gbk.path] + + for gbk in gbks_from_db: bs_hmm.HSP.load_all(gbk.genes) - return input_gbks_from_db + return gbks_from_db # if we end up here, we are in some halfway state and need to load in the new data logging.info("Loading existing run from disk and adding new data...") @@ -313,11 +313,5 @@ def load_gbks(run: dict, bigscape_dir: Path) -> list[GBK]: for gbk in missing_gbks: gbk.save_all() - # now we have all new data in the database, we can load it all in to the correct - # python objects - input_gbks_from_db = GBK.load_many(input_gbks) - for gbk in input_gbks_from_db: - gbk.source_type = source_dict[gbk.hash] - bs_hmm.HSP.load_all(gbk.genes) - - return input_gbks_from_db + # still return the full set + return input_gbks diff --git a/big_scape/genbank/candidate_cluster.py b/big_scape/genbank/candidate_cluster.py index a5689578..576b920d 100644 --- a/big_scape/genbank/candidate_cluster.py +++ b/big_scape/genbank/candidate_cluster.py @@ -188,7 +188,6 @@ def load_all(region_dict: dict[int, Region]): record_table.c.product, ) .where(record_table.c.record_type == "cand_cluster") - .where(record_table.c.parent_id.in_(region_dict.keys())) .compile() ) diff --git a/big_scape/genbank/cds.py b/big_scape/genbank/cds.py index f3e0bc15..72c4650a 100644 --- a/big_scape/genbank/cds.py +++ b/big_scape/genbank/cds.py @@ -354,7 +354,6 @@ def load_all(gbk_dict: dict[int, GBK]) -> None: cds_table.c.aa_seq, ) .order_by(cds_table.c.orf_num) - .where(cds_table.c.gbk_id.in_(gbk_dict.keys())) .compile() ) diff --git a/big_scape/genbank/gbk.py b/big_scape/genbank/gbk.py index 22489756..7a43a562 100644 --- a/big_scape/genbank/gbk.py +++ b/big_scape/genbank/gbk.py @@ -7,7 +7,6 @@ # from enum import Enum from pathlib import Path from typing import Dict, Optional -import hashlib # from dependencies @@ -48,9 +47,8 @@ class GBK: source_type: SOURCE_TYPE """ - def __init__(self, path, hash, source_type) -> None: + def __init__(self, path, source_type) -> None: self.path: Path = path - self.hash: str = hash self.metadata: Dict[str, str] = {} self.region: Optional[Region] = None self.nt_seq: SeqRecord.seq = None @@ -160,7 +158,6 @@ def save(self, commit=True) -> None: gbk_table.insert() .values( path=str(self.path), - hash=str(self.hash), nt_seq=str(self.nt_seq), organism=organism, taxonomy=taxonomy, @@ -217,7 +214,6 @@ def load_all() -> list[GBK]: gbk_table.select() .add_columns( gbk_table.c.id, - gbk_table.c.hash, gbk_table.c.path, gbk_table.c.nt_seq, gbk_table.c.organism, @@ -231,7 +227,7 @@ def load_all() -> list[GBK]: gbk_dict = {} for result in cursor_result.all(): - new_gbk = GBK(Path(result.path), result.hash, "") + new_gbk = GBK(Path(result.path), "") new_gbk._db_id = result.id new_gbk.nt_seq = result.nt_seq new_gbk.metadata["organism"] = result.organism @@ -249,7 +245,51 @@ def load_all() -> list[GBK]: return list(gbk_dict.values()) @staticmethod - def load_many(input_gbks: list[GBK]) -> list[GBK]: + def load_one(gbk_id: int) -> GBK: + """Load a single GBK object from the database + + Args: + gbk_id (int): id of gbk to load + + Returns: + GBK: loaded GBK object + """ + + if not DB.metadata: + raise RuntimeError("DB.metadata is None") + + gbk_table = DB.metadata.tables["gbk"] + select_query = ( + gbk_table.select() + .add_columns( + gbk_table.c.id, + gbk_table.c.path, + gbk_table.c.source_type, + gbk_table.c.nt_seq, + gbk_table.c.organism, + gbk_table.c.taxonomy, + gbk_table.c.description, + ) + .where(gbk_table.c.id == gbk_id) + .compile() + ) + + result = DB.execute(select_query).fetchone() + + if result is None: + raise RuntimeError(f"No GBK with id {gbk_id}") + + new_gbk = GBK(Path(result.path), result.source_type) + new_gbk._db_id = result.id + new_gbk.nt_seq = result.nt_seq + new_gbk.metadata["organism"] = result.organism + new_gbk.metadata["taxonomy"] = result.taxonomy + new_gbk.metadata["description"] = result.description + + return new_gbk + + @staticmethod + def load_many(gbk_ids: list[int]) -> list[GBK]: """Load a list of GBK objects from the database Args: @@ -259,8 +299,6 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]: list[GBK]: loaded GBK objects """ - input_gbk_hashes = [gbk.hash for gbk in input_gbks] - if not DB.metadata: raise RuntimeError("DB.metadata is None") @@ -269,14 +307,14 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]: gbk_table.select() .add_columns( gbk_table.c.id, - gbk_table.c.hash, gbk_table.c.path, + gbk_table.c.source_type, gbk_table.c.nt_seq, gbk_table.c.organism, gbk_table.c.taxonomy, gbk_table.c.description, ) - .where(gbk_table.c.hash.in_(input_gbk_hashes)) + .where(gbk_table.c.id.in_(gbk_ids)) .compile() ) @@ -284,7 +322,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]: gbk_dict = {} for result in cursor_result.all(): - new_gbk = GBK(Path(result.path), result.hash, "") + new_gbk = GBK(Path(result.path), result.source_type) new_gbk._db_id = result.id new_gbk.nt_seq = result.nt_seq new_gbk.metadata["organism"] = result.organism @@ -342,14 +380,7 @@ def parse( GBK: GBK object """ - # get unique content hash - f = open(path, "r") - data = f.read() - f.close() - data = data.encode("utf-8") # type: ignore - hash = hashlib.sha256(data).hexdigest() # type: ignore - - gbk = cls(path, hash, source_type) + gbk = cls(path, source_type) # get record. should only ever be one for Antismash GBK record: SeqRecord = next(SeqIO.parse(path, "genbank")) @@ -559,13 +590,13 @@ def __repr__(self) -> str: return f"GBK {self.path.name}, {len(self.genes)} genes" def __hash__(self) -> int: - return hash(self.hash) + return hash(self.path) def __eq__(self, other) -> bool: if not isinstance(other, GBK): return False - if self.hash is None or other.hash is None: + if self.path is None or other.path is None: return False - return self.hash == other.hash + return self.path == other.path diff --git a/big_scape/genbank/proto_cluster.py b/big_scape/genbank/proto_cluster.py index 918bc926..328330ef 100644 --- a/big_scape/genbank/proto_cluster.py +++ b/big_scape/genbank/proto_cluster.py @@ -204,7 +204,6 @@ def load_all(candidate_cluster_dict: dict[int, CandidateCluster]): record_table.c.category, ) .where(record_table.c.record_type == "protocluster") - .where(record_table.c.parent_id.in_(candidate_cluster_dict.keys())) .compile() ) diff --git a/big_scape/genbank/proto_core.py b/big_scape/genbank/proto_core.py index b8e3b5b4..bcbfd879 100644 --- a/big_scape/genbank/proto_core.py +++ b/big_scape/genbank/proto_core.py @@ -135,7 +135,6 @@ def load_all(protocluster_dict: dict[int, ProtoCluster]): record_table.c.category, ) .where(record_table.c.record_type == "proto_core") - .where(record_table.c.parent_id.in_(protocluster_dict.keys())) .compile() ) diff --git a/big_scape/genbank/region.py b/big_scape/genbank/region.py index 6a93654a..c47ede07 100644 --- a/big_scape/genbank/region.py +++ b/big_scape/genbank/region.py @@ -241,7 +241,6 @@ def load_all(gbk_dict: dict[int, GBK]) -> None: record_table.c.product, ) .where(record_table.c.record_type == "region") - .where(record_table.c.gbk_id.in_(gbk_dict.keys())) .compile() ) diff --git a/big_scape/output/html_template/output/html_content/js/bigscape.js b/big_scape/output/html_template/output/html_content/js/bigscape.js index 9947fe2e..bd0c1997 100644 --- a/big_scape/output/html_template/output/html_content/js/bigscape.js +++ b/big_scape/output/html_template/output/html_content/js/bigscape.js @@ -323,7 +323,7 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n // construct the graph for (var i = 0; i < bs_data.length; i++) { var bs_obj = bs_data[i]; - graph.addNode(i, { id: bs_obj["id"], hash: bs_obj["hash"], cl: bs_to_cl[i] }); + graph.addNode(i, { id: bs_obj["id"], cl: bs_to_cl[i] }); } for (var a = 0; a < bs_data.length; a++) { for (var b = 0; b < bs_data.length; b++) { @@ -449,7 +449,7 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n .attr("stroke", "#777") .attr("stroke-width", link["data"]["weight"] * 10); - if (graph.getNode(link.fromId).data.hash === graph.getNode(link.toId).data.hash) { + if (graph.getNode(link.fromId).data.id === graph.getNode(link.toId).data.id) { line = line.attr("stroke-dasharray", "10,10") } return line diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index 4f0a1d88..a7f322a5 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -576,7 +576,6 @@ def generate_bs_data_js( "start: int, "end": int, "id": str, (e.g. AL645882.2.cluster010), - "hash": str, "mibig": bool, "source": str, (e.g. mibig, reference, or query), "record_start": int, (e.g. cds boundaries of protocluster, index starts at 1) @@ -630,10 +629,7 @@ def generate_bs_data_js( "desc": organism, "start": 1, "end": len(gbk.nt_seq), - "id": "_".join( - [gbk.path.name, type(record).__name__.lower(), str(record.number)] - ), - "hash": gbk.hash, + "id": gbk.path.name, "mibig": gbk.source_type == SOURCE_TYPE.MIBIG, "source": gbk.source_type.name.lower(), "record_start": rec_start, diff --git a/test/comparison/test_binning.py b/test/comparison/test_binning.py index 314f99bb..44d1379e 100644 --- a/test/comparison/test_binning.py +++ b/test/comparison/test_binning.py @@ -31,7 +31,7 @@ def create_mock_gbk(i, source_type: bs_enums.SOURCE_TYPE) -> GBK: - gbk = GBK(Path(f"test_path_{i}.gbk"), str(i), source_type) + gbk = GBK(Path(f"test_path_{i}.gbk"), source_type) cds = CDS(0, 100) cds.parent_gbk = gbk cds.orf_num = 1 @@ -63,7 +63,7 @@ def test_pair_repr(self): """Tests whether calling str() on a bin object returns an expected string representation of the object """ - gbk = GBK(Path("test"), "test", bs_enums.SOURCE_TYPE.QUERY) + gbk = GBK(Path("test"), bs_enums.SOURCE_TYPE.QUERY) bgc_a = BGCRecord(gbk, 0, 0, 10, False, "") bgc_b = BGCRecord(gbk, 0, 10, 20, False, "") @@ -83,7 +83,7 @@ def test_pair_no_parent_gbk(self): """Tests whether initialization of a BGC pair where one of the BGCs does not have a parent GBK correctly throws a ValueError """ - gbk = GBK("", "", "test") + gbk = GBK("", "") bgc_a = BGCRecord(gbk, 0, 0, 10, False, "") @@ -108,7 +108,7 @@ def test_bin_repr(self): """Tests whether calling str() on a bin object returns an expected string representation of the object """ - parent_gbk = GBK(Path("test"), "test", source_type=bs_enums.SOURCE_TYPE.QUERY) + parent_gbk = GBK(Path("test"), source_type=bs_enums.SOURCE_TYPE.QUERY) bgc_a = BGCRecord(parent_gbk, 0, 0, 10, False, "") bgc_b = BGCRecord(parent_gbk, 0, 0, 10, False, "") bgc_c = BGCRecord(parent_gbk, 0, 0, 10, False, "") @@ -129,7 +129,7 @@ def test_bin_repr(self): def test_num_pairs_too_few_records(self): """tests if bin.num_pairs() correctly returns 0 if there is only one record in the bin""" - gbk_a = GBK(Path("test1.gbk"), "test", "test") + gbk_a = GBK(Path("test1.gbk"), "test") bgc_a = BGCRecord(gbk_a, 0, 0, 10, False, "") new_bin = RecordPairGenerator("test", 1) @@ -144,12 +144,8 @@ def test_num_pairs_too_few_records(self): def test_num_pairs_correct_with_query_ref(self): """Tests whether bin.num_pairs() correctly returns all query and ref but not ref <-> ref pairs""" - parent_gbk_query = GBK( - Path("test"), "test", source_type=bs_enums.SOURCE_TYPE.QUERY - ) - parent_gbk_ref = GBK( - Path("test"), "test", source_type=bs_enums.SOURCE_TYPE.REFERENCE - ) + parent_gbk_query = GBK(Path("test"), source_type=bs_enums.SOURCE_TYPE.QUERY) + parent_gbk_ref = GBK(Path("test"), source_type=bs_enums.SOURCE_TYPE.REFERENCE) bgc_a = BGCRecord(parent_gbk_query, 0, 0, 10, False, "") bgc_b = BGCRecord(parent_gbk_query, 0, 0, 10, False, "") bgc_c = BGCRecord(parent_gbk_ref, 0, 0, 10, False, "") @@ -169,11 +165,11 @@ def test_num_pairs_correct_with_query_ref(self): def test_legacy_sorting(self): """Tests whether the legacy sorting option in bin.pairs() correctly orders the pairs""" - gbk_a = GBK(Path("test1.gbk"), "test", "test") + gbk_a = GBK(Path("test1.gbk"), "test") bgc_a = BGCRecord(gbk_a, 0, 0, 10, False, "") - gbk_b = GBK(Path("test2.gbk"), "test", "test") + gbk_b = GBK(Path("test2.gbk"), "test") bgc_b = BGCRecord(gbk_b, 0, 0, 10, False, "") - gbk_c = GBK(Path("test3.gbk"), "test", "test") + gbk_c = GBK(Path("test3.gbk"), "test") bgc_c = BGCRecord(gbk_c, 0, 0, 10, False, "") # due to the order, this should generate a list of pairs as follows without legacy sort: @@ -734,7 +730,7 @@ def test_cull_singletons_cutoff(self): class TestMixComparison(TestCase): def test_mix_iter(self): """Tests whether a new mix bin can be created for comparison""" - gbk = GBK(Path("test"), "test", source_type=bs_enums.SOURCE_TYPE.QUERY) + gbk = GBK(Path("test"), source_type=bs_enums.SOURCE_TYPE.QUERY) bgc_a = BGCRecord(gbk, 0, 0, 10, False, "") bgc_a.parent_gbk = gbk @@ -844,7 +840,7 @@ def test_as_class_bin_generator(self): "Bin 'PKS': 1 pairs from 2 BGC records" bs_data.DB.create_in_mem() - gbk_1 = GBK(Path("test"), "test", source_type=bs_enums.SOURCE_TYPE.QUERY) + gbk_1 = GBK(Path("test"), source_type=bs_enums.SOURCE_TYPE.QUERY) gbk_1.region = mock_region() region_1 = gbk_1.region @@ -852,7 +848,7 @@ def test_as_class_bin_generator(self): protocluster_1 = cand_cluster_1.proto_clusters[1] protocore_1 = protocluster_1.proto_core[1] - gbk_2 = GBK(Path("test"), "test", source_type=bs_enums.SOURCE_TYPE.QUERY) + gbk_2 = GBK(Path("test"), source_type=bs_enums.SOURCE_TYPE.QUERY) gbk_2.region = mock_region() region_2 = gbk_2.region diff --git a/test/comparison/test_comparable_region.py b/test/comparison/test_comparable_region.py index 72178c63..0828aff6 100644 --- a/test/comparison/test_comparable_region.py +++ b/test/comparison/test_comparable_region.py @@ -27,7 +27,7 @@ def test_get_dom_list_lcs(self): "PF00005", ] - gbk_a = GBK("", "", "") + gbk_a = GBK("", "") gbk_a.region = Region(gbk_a, 0, 0, 100, False, "") for a_domain in a_domains: @@ -35,7 +35,7 @@ def test_get_dom_list_lcs(self): gbk_a.genes.append(cds_a) cds_a.hsps.append(HSP(cds_a, a_domain, 100, 0, 30)) - gbk_b = GBK("", "", "") + gbk_b = GBK("", "") gbk_b.region = Region(gbk_b, 0, 0, 100, False, "") for b_domain in b_domains: @@ -77,7 +77,7 @@ def test_get_dom_list_lcs_reverse(self): "PF00004", ] - gbk_a = GBK("", "", "") + gbk_a = GBK("", "") gbk_a.region = Region(gbk_a, 0, 0, 100, False, "") for a_domain in a_domains: @@ -85,7 +85,7 @@ def test_get_dom_list_lcs_reverse(self): gbk_a.genes.append(cds_a) cds_a.hsps.append(HSP(cds_a, a_domain, 100, 0, 30)) - gbk_b = GBK("", "", "") + gbk_b = GBK("", "") gbk_b.region = Region(gbk_b, 0, 0, 100, False, "") for b_domain in b_domains: @@ -144,7 +144,7 @@ def test_cds_range_contains_biosyntetic_true(self): record in which a region contains a biosynthetic gene """ - gbk = GBK(None, "test", "test") + gbk = GBK(None, "test") record = Region(gbk, 0, 0, 0, False, "") @@ -181,7 +181,7 @@ def test_cds_range_contains_biosyntetic_false(self): record in which a region contains a biosynthetic gene """ - gbk = GBK(None, "test", "test") + gbk = GBK(None, "test") record = Region(gbk, 0, 0, 0, False, "") @@ -225,7 +225,7 @@ def test_get_domain_dicts(self): "PF00004", ] - gbk_a = GBK("", "", "") + gbk_a = GBK("", "") gbk_a.region = Region(gbk_a, 0, 0, 100, False, "") for a_domain in a_domains: @@ -235,7 +235,7 @@ def test_get_domain_dicts(self): env_stop = env_start + 10 cds_a.hsps.append(HSP(cds_a, a_domain, 100, env_start, env_stop)) - gbk_b = GBK("", "", "") + gbk_b = GBK("", "") gbk_b.region = Region(gbk_b, 0, 0, 100, False, "") for b_domain in b_domains: diff --git a/test/comparison/test_lcs.py b/test/comparison/test_lcs.py index b63b9a70..db0b3219 100644 --- a/test/comparison/test_lcs.py +++ b/test/comparison/test_lcs.py @@ -78,7 +78,7 @@ def generate_mock_cds_lists( def generate_mock_protocluster(cds_list: list[bs_genbank.CDS], protocore_idx: int): """Generate a mock protocluster from a cds list""" - gbk = bs_genbank.GBK(None, "test", bs_enums.SOURCE_TYPE.QUERY) + gbk = bs_genbank.GBK(None, bs_enums.SOURCE_TYPE.QUERY) gbk.genes = cds_list protocluster = bs_genbank.ProtoCluster( gbk, 1, 0, len(cds_list) * 100, False, "", {} @@ -233,7 +233,7 @@ def test_find_protocore_type_check(self): 10, 10, [1, 2, 2, 2, 3], [1, 2, 2, 2, 3], False ) - gbk = bs_genbank.GBK("", "", bs_enums.SOURCE_TYPE.QUERY) + gbk = bs_genbank.GBK("", bs_enums.SOURCE_TYPE.QUERY) region = bs_genbank.Region(gbk, 1, 0, len(cds_a) * 100, False, "") @@ -248,7 +248,7 @@ def test_lcs_protocluster_type_check_a(self): 10, 10, [1, 2, 2, 2, 3], [1, 2, 2, 2, 3], False ) - gbk = bs_genbank.GBK("", "", bs_enums.SOURCE_TYPE.QUERY) + gbk = bs_genbank.GBK("", bs_enums.SOURCE_TYPE.QUERY) record_a = bs_genbank.Region(gbk, 1, 0, len(cds_a) * 100, False, "") record_b = bs_genbank.Region(gbk, 1, 0, len(cds_b) * 100, False, "") @@ -266,7 +266,7 @@ def test_lcs_protocluster_type_check_b(self): 10, 10, [1, 2, 2, 2, 3], [1, 2, 2, 2, 3], False ) - gbk = bs_genbank.GBK("", "", bs_enums.SOURCE_TYPE.QUERY) + gbk = bs_genbank.GBK("", bs_enums.SOURCE_TYPE.QUERY) record_a = bs_genbank.ProtoCluster( gbk, 1, 0, len(cds_a) * 100, False, "", "", {} diff --git a/test/comparison/test_scores.py b/test/comparison/test_scores.py index a1800558..990076df 100644 --- a/test/comparison/test_scores.py +++ b/test/comparison/test_scores.py @@ -62,7 +62,7 @@ def test_full_overlap_pair(self): a_domains = [""] b_domains = [""] - gbk_a = GBK("", "", "") + gbk_a = GBK("", "") gbk_a.region = Region(gbk_a, 0, 0, 100, False, "") for a_domain in a_domains: @@ -70,7 +70,7 @@ def test_full_overlap_pair(self): gbk_a.genes.append(cds_a) cds_a.hsps.append(HSP(cds_a, a_domain, 100, 0, 30)) - gbk_b = GBK("", "", "") + gbk_b = GBK("", "") gbk_b.region = Region(gbk_b, 0, 0, 100, False, "") for b_domain in b_domains: @@ -112,7 +112,7 @@ def test_partial_overlap_pair(self): "PF00005", ] - gbk_a = GBK("", "", "") + gbk_a = GBK("", "") gbk_a.region = Region(gbk_a, 0, 0, 100, False, "") for a_domain in a_domains: @@ -120,7 +120,7 @@ def test_partial_overlap_pair(self): gbk_a.genes.append(cds_a) cds_a.hsps.append(HSP(cds_a, a_domain, 100, 0, 30)) - gbk_b = GBK("", "", "") + gbk_b = GBK("", "") gbk_b.region = Region(gbk_b, 0, 0, 100, False, "") for b_domain in b_domains: @@ -160,7 +160,7 @@ def test_no_overlap_pair(self): "PF00004", ] - gbk_a = GBK("", "", "") + gbk_a = GBK("", "") gbk_a.region = Region(gbk_a, 0, 0, 100, False, "") gbk_a.region.parent_gbk = gbk_a @@ -169,7 +169,7 @@ def test_no_overlap_pair(self): gbk_a.genes.append(cds_a) cds_a.hsps.append(HSP(cds_a, a_domain, 100, 0, 30)) - gbk_b = GBK("", "", "") + gbk_b = GBK("", "") gbk_b.region = Region(gbk_b, 0, 0, 100, False, "") gbk_b.region.parent_gbk = gbk_b @@ -266,10 +266,10 @@ def test_all_adjacent_pair(self): "PF00003", ] - gbk_a = GBK("", "", "") + gbk_a = GBK("", "") gbk_a.region = Region(gbk_a, 0, 0, 100, False, "") - gbk_b = GBK("", "", "") + gbk_b = GBK("", "") gbk_b.region = Region(gbk_b, 0, 0, 100, False, "") for shared_domain in shared_domains: @@ -301,10 +301,10 @@ def test_get_distance_from_unshared_none(self): a_domains = ["A", "B", "C"] b_domains = ["A", "B", "C"] - gbk_a = GBK("", "", "") + gbk_a = GBK("", "") gbk_a.region = Region(gbk_a, 0, 0, 100, False, "") - gbk_b = GBK("", "", "") + gbk_b = GBK("", "") gbk_b.region = Region(gbk_b, 0, 0, 100, False, "") for a_domain in a_domains: @@ -333,10 +333,10 @@ def test_get_distance_from_unshared_no_anchor(self): a_domains = ["A", "B", "C"] b_domains = ["C", "D", "E"] - gbk_a = GBK("", "", "") + gbk_a = GBK("", "") gbk_a.region = Region(gbk_a, 0, 0, 100, False, "") - gbk_b = GBK("", "", "") + gbk_b = GBK("", "") gbk_b.region = Region(gbk_b, 0, 0, 100, False, "") for a_domain in a_domains: @@ -366,10 +366,10 @@ def test_get_distance_from_unshared_full(self): anchor_domains_set = set(["A", "E"]) - gbk_a = GBK("", "", "") + gbk_a = GBK("", "") gbk_a.region = Region(gbk_a, 0, 0, 100, False, "") - gbk_b = GBK("", "", "") + gbk_b = GBK("", "") gbk_b.region = Region(gbk_b, 0, 0, 100, False, "") for a_domain in a_domains: diff --git a/test/db/test_partial.py b/test/db/test_partial.py index 1d4d2188..6ed76b1e 100644 --- a/test/db/test_partial.py +++ b/test/db/test_partial.py @@ -26,7 +26,7 @@ def create_mock_gbk(i) -> GBK: - gbk = GBK(Path(f"test_path_{i}.gbk"), str(i), bs_enums.SOURCE_TYPE.QUERY) + gbk = GBK(Path(f"test_path_{i}.gbk"), bs_enums.SOURCE_TYPE.QUERY) cds = CDS(0, 100) cds.parent_gbk = gbk cds.orf_num = 1 @@ -116,7 +116,7 @@ def __init__(self, methodName: str = "runTest") -> None: def test_min_task_data(self): DB.create_in_mem() - expected_min_task = bs_enums.TASK.SAVE_GBKS + expected_min_task = bs_enums.TASK.LOAD_GBKS gbks = [create_mock_gbk(1)] @@ -223,7 +223,7 @@ def test_new_data(self): gbk_not_in_db = create_mock_gbk(2) gbks = [gbk_in_db, gbk_not_in_db] - expected_state = bs_enums.INPUT_TASK.MIXED_DATA + expected_state = bs_enums.INPUT_TASK.NEW_DATA actual_state = get_input_data_state(gbks) self.assertEqual(expected_state, actual_state) diff --git a/test/genbank/test_bgc_record.py b/test/genbank/test_bgc_record.py index f0408dac..9c09daa0 100644 --- a/test/genbank/test_bgc_record.py +++ b/test/genbank/test_bgc_record.py @@ -19,7 +19,7 @@ def test_get_cds(self): nt_start and nt_stop coordinates """ - gbk = GBK("", "", "") + gbk = GBK("", "") record = BGCRecord(gbk, 0, 10, 90, False, "") # 10 total @@ -50,7 +50,7 @@ def test_get_cds_all(self): parameter set to True """ - gbk = GBK("", "", "") + gbk = GBK("", "") record = BGCRecord(gbk, 0, 10, 90, False, "") gbk.genes = [ @@ -81,7 +81,7 @@ def test_get_hsps(self): """ domains = ["PF00001", "PF00002", "PF00003", "PF00004", "PF00005"] - gbk = GBK("", "", "") + gbk = GBK("", "") gbk.region = Region(gbk, 0, 0, 100, False, "") cds = CDS(10, 90) gbk.genes.append(cds) @@ -116,7 +116,7 @@ def test_get_cds_with_domains(self): cds_list[4].hsps = [HSP(cds_list[0], "test", 1.0, 0, 100)] cds_list[6].hsps = [HSP(cds_list[0], "test", 1.0, 0, 100)] - gbk = GBK("", "", "test") + gbk = GBK("", "test") gbk.genes = cds_list region = BGCRecord(gbk, 0, 0, 100, False, "") diff --git a/test/genbank/test_gbk.py b/test/genbank/test_gbk.py index 3e1e907a..d68fc16c 100644 --- a/test/genbank/test_gbk.py +++ b/test/genbank/test_gbk.py @@ -406,7 +406,7 @@ def test_parse_gbk_has_dna_seq(self): def test_filter_overlap_over_threshold(self): """Test whether add_cds_filter_overlap correctly throws out a cds""" - gbk = GBK("", "", "") + gbk = GBK("", "") cds_a = CDS(0, 18) cds_a.aa_seq = "M" * 6 @@ -433,7 +433,7 @@ def test_filter_overlap_under_threshold(self): another CDS but under the cutoff threshold """ - gbk = GBK("", "", "") + gbk = GBK("", "") cds_a = CDS(0, 18) cds_a.aa_seq = "M" * 6 @@ -456,7 +456,7 @@ def test_filter_overlap_over_threshold_diff_strands(self): cutoff threshold, but is on a different strand """ - gbk = GBK("", "", "") + gbk = GBK("", "") cds_a = CDS(0, 18) cds_a.aa_seq = "M" * 6 @@ -554,32 +554,6 @@ def test_load_all(self): self.assertEqual(expected_gbk_count, actual_gbk_count) - def test_load_many(self): - """Tests whether a set of GBKs can be recreated from a database - using load many - """ - DB.create_in_mem() - - gbk_file_path = Path("test/test_data/valid_gbk_folder/valid_input_region.gbk") - run = { - "input_dir": Path("test/test_data/valid_gbk_folder/"), - "input_mode": bs_enums.INPUT_MODE.RECURSIVE, - "include_gbk": None, - "exclude_gbk": None, - "cds_overlap_cutoff": None, - "cores": None, - "classify": False, - "legacy_classify": False, - } - - gbk = GBK.parse(gbk_file_path, SOURCE_TYPE.QUERY, run) - - gbk.save() - - loaded_gbks = GBK.load_many([gbk]) - - self.assertEqual(gbk, loaded_gbks[0]) - def test_load_all_has_regions(self): """Tests whether the region objects were correctly loaded when loading GBKs""" populated_db_path = Path("test/test_data/database/valid_populated.db") diff --git a/test/hmm/test_hmm_align.py b/test/hmm/test_hmm_align.py index 102a017c..2fa86391 100644 --- a/test/hmm/test_hmm_align.py +++ b/test/hmm/test_hmm_align.py @@ -70,7 +70,7 @@ def test_save_hmmalignment(self): "IMATEGYQSSGSSNITVSG" ) - gbk = GBK("", "", bs_enums.SOURCE_TYPE.QUERY) + gbk = GBK("", bs_enums.SOURCE_TYPE.QUERY) gbk.metadata = {"organism": "test", "taxonomy": "test", "description": "test"} gbk.save() diff --git a/test/hmm/test_hmm_scan.py b/test/hmm/test_hmm_scan.py index 26f5d2e2..a4a76164 100644 --- a/test/hmm/test_hmm_scan.py +++ b/test/hmm/test_hmm_scan.py @@ -142,7 +142,7 @@ def test_save_hsp(self): "IMATEGYQSSGSSNITVSG" ) - gbk = GBK("", "", bs_enums.SOURCE_TYPE.QUERY) + gbk = GBK("", bs_enums.SOURCE_TYPE.QUERY) gbk.metadata = {"organism": "test", "taxonomy": "test", "description": "test"} gbk.save() diff --git a/test/network/test_network.py b/test/network/test_network.py index 388f9479..2a0aaa24 100644 --- a/test/network/test_network.py +++ b/test/network/test_network.py @@ -16,9 +16,7 @@ def create_mock_gbk(i) -> bs_gbk.GBK: - gbk = bs_gbk.GBK( - pathlib.Path(f"test_path_{i}.gbk"), str(i), bs_enums.SOURCE_TYPE.QUERY - ) + gbk = bs_gbk.GBK(pathlib.Path(f"test_path_{i}.gbk"), bs_enums.SOURCE_TYPE.QUERY) cds = bs_gbk.CDS(0, 100) cds.parent_gbk = gbk cds.orf_num = 1 diff --git a/test/test_data/database/valid_family.db b/test/test_data/database/valid_family.db index 26ac75fcc8b493e13cde829b1139c7e14652edd6..bad96148eb09ed90bfa84ad873b418b73fc8d555 100755 GIT binary patch delta 110 zcmZoTAlYz0a)PuV8v_GF8W6((+e95>el`ZZcv)VaZw!JgwhX)t8}qt(7`rAj^0sqU zGO~*c3o|xHPM*jc!(E(TT9lj`Us73++We8X{Ua|U5HkTWGZ3=?G3)k^ylnhI0B=nm A8UO$Q delta 103 zcmZoTAlYz0a)PuVI|Bnl8W6((`$QdMes%`Ecv)VaZw!JgwhX-O8}qt(7$;7g*vr*e t#K&KJ-LfFh9x7hIHUO^Z~I4HMj&PaVrC#_0bmMg8#!yHJ;R2I*0xB-T zHRvt*ExzhA&AH9QLicZla3qAVZ_pA#?3Vem+%9{%?5@9lowB$}mU8azsZT{TP0sqO zS7ms{?~_q~@b>XJ9Ly~BV<`z6>~P=!PW+z>hj7D#BOK!dUijcg02+b_A&dx45ycsF zoFj${#F0P}mq;Ov46cyHHEwW=9P%jO4n>r3j|Y@dK@~MT;t9`qK^+Y=(Lx)>#!{W$ F-yZ6kY})_; delta 336 zcmWm3yK(_h0D$4MEJp-cHXwahR3YpQbPOs1?Hlv6#MxosqPoVN7UPGNS ze~PdA9B1uu@MAV`0;-MTp)-`gb+pqSI`he3_7k6#|;v=MG`3(NF#$R?vO(s x1r%|Q2b568BcAY#7gX?yH@w3{6*bh+Koc#r(LonI^fAB?BaC4kUHM`E{sDm4YRv!u diff --git a/test/trees/test_alignment.py b/test/trees/test_alignment.py index aeb67948..6a8787a5 100644 --- a/test/trees/test_alignment.py +++ b/test/trees/test_alignment.py @@ -18,8 +18,8 @@ def test_tree_gen_small(self): """Tests generated tree for families with less than three members""" records = [ - BGCRecord(GBK("", "", ""), 0, 0, 0, False, ""), - BGCRecord(GBK("", "", ""), 0, 0, 0, False, ""), + BGCRecord(GBK("", ""), 0, 0, 0, False, ""), + BGCRecord(GBK("", ""), 0, 0, 0, False, ""), ] exemplar = 0 mock_family = [0, 1] @@ -31,8 +31,8 @@ def test_tree_gen_small(self): def test_gcf_alignment(self): """Tests alignment of GCF HSP alignments""" - gbk_a = GBK("", "", "") - gbk_b = GBK("", "", "") + gbk_a = GBK("", "") + gbk_b = GBK("", "") cds_a = CDS(0, 20) cds_b = CDS(0, 20) hsp_a = HSP(cds_a, "PF1", 1, 0, 10) @@ -72,7 +72,7 @@ def test_lcs_adjust_ex2mem(self): expected_lcs = (5, 8, False) new_lcs = adjust_lcs_to_all_genes( - mock_result, 0, 1, GBK("", "", ""), GBK("", "", ""), dom_to_gene, dom_count + mock_result, 0, 1, GBK("", ""), GBK("", ""), dom_to_gene, dom_count ) self.assertEqual(new_lcs, expected_lcs) @@ -94,7 +94,7 @@ def test_lcs_adjust_ex2mem_reverse(self): expected_lcs = (4, 2, True) new_lcs = adjust_lcs_to_all_genes( - mock_result, 0, 1, GBK("", "", ""), GBK("", "", ""), dom_to_gene, dom_count + mock_result, 0, 1, GBK("", ""), GBK("", ""), dom_to_gene, dom_count ) self.assertEqual(new_lcs, expected_lcs) @@ -116,7 +116,7 @@ def test_lcs_adjust_mem2ex(self): expected_lcs = (2, 4, False) new_lcs = adjust_lcs_to_all_genes( - mock_result, 1, 0, GBK("", "", ""), GBK("", "", ""), dom_to_gene, dom_count + mock_result, 1, 0, GBK("", ""), GBK("", ""), dom_to_gene, dom_count ) self.assertEqual(new_lcs, expected_lcs) @@ -138,7 +138,7 @@ def test_lcs_adjust_mem2ex_reverse(self): expected_lcs = (2, 4, True) new_lcs = adjust_lcs_to_all_genes( - mock_result, 1, 0, GBK("", "", ""), GBK("", "", ""), dom_to_gene, dom_count + mock_result, 1, 0, GBK("", ""), GBK("", ""), dom_to_gene, dom_count ) self.assertEqual(new_lcs, expected_lcs) @@ -163,10 +163,10 @@ def test_lcs_adjust_zero_length_not_reversed(self): fill_cds = CDS(10, 20) fill_cds.strand = -1 - exempl_gbk = GBK("", "", "") + exempl_gbk = GBK("", "") exempl_gbk.genes = [fill_cds, fill_cds, max_dom_cds, fill_cds, fill_cds] - mem_gbk = GBK("", "", "") + mem_gbk = GBK("", "") mem_gbk.genes = [CDS(0, 1), CDS(2, 3), CDS(4, 5), max_dom_cds] expected_lcs = (2, 3, False) @@ -197,12 +197,12 @@ def test_lcs_adjust_zero_length_reversed(self): fill_cds = CDS(0, 0) fill_cds.strand = -1 - exempl_gbk = GBK("", "", "") + exempl_gbk = GBK("", "") exempl_gbk.genes = [fill_cds, fill_cds, max_dom_cds, fill_cds, fill_cds] max_dom_cds_rev = CDS(0, 0) max_dom_cds_rev.strand = -1 - mem_gbk = GBK("", "", "") + mem_gbk = GBK("", "") mem_gbk.genes = [CDS(0, 0), CDS(0, 0), CDS(0, 0), max_dom_cds_rev] expected_lcs = (2, 1, True)