Revert "Feature/input subset" #82

Merged

merged 1 commit into from Nov 14, 2023
36 changes: 15 additions & 21 deletions big_scape/data/partial_task.py
@@ -29,14 +29,8 @@ def find_minimum_task(gbks: list[GBK]):
     """
     input_data_state = get_input_data_state(gbks)
 
-    # new data or mixed data
-    if (
-        input_data_state.value == bs_enums.INPUT_TASK.NEW_DATA.value
-        or input_data_state.value == bs_enums.INPUT_TASK.MIXED_DATA.value
-        or input_data_state.value == bs_enums.INPUT_TASK.NO_DATA.value
-    ):
-        # gbks from input need to be loaded into the in-memory database
-        return bs_enums.TASK.SAVE_GBKS
+    if input_data_state.value < bs_enums.INPUT_TASK.SAME_DATA.value:
+        return bs_enums.TASK.LOAD_GBKS
 
     hmm_data_state = get_hmm_data_state(gbks)
 
@@ -68,20 +62,20 @@ def get_input_data_state(gbks: list[GBK]) -> bs_enums.INPUT_TASK:
 
     # get set of gbks in database
     db_gbk_rows = DB.execute(gbk_table.select()).all()
-    db_gbk_hashes: set[str] = {db_gbk_row[2] for db_gbk_row in db_gbk_rows}
-    input_gbk_hashes: set[str] = {str(gbk.hash) for gbk in gbks}
+    db_gbk_paths: set[str] = {db_gbk_row[1] for db_gbk_row in db_gbk_rows}
+    input_gbk_paths: set[str] = {str(gbk.path) for gbk in gbks}
 
-    if db_gbk_hashes == input_gbk_hashes:
+    if db_gbk_paths == input_gbk_paths:
         return bs_enums.INPUT_TASK.SAME_DATA
 
-    union = db_gbk_hashes & input_gbk_hashes
+    sym_dif = db_gbk_paths.symmetric_difference(input_gbk_paths)
 
-    # all new data
-    if len(union) == 0:
+    # still same amount in db. new data
+    if len(sym_dif) == len(db_gbk_paths):
         return bs_enums.INPUT_TASK.NEW_DATA
 
-    # only partial data which is already in database
-    if len(union) == len(input_gbk_hashes):
+    # same amount in new data. there was more in db than in new data
+    if len(sym_dif) == len(input_gbk_paths):
         return bs_enums.INPUT_TASK.PARTIAL_DATA
 
     # otherwise there is some new data, some old data is missing
@@ -98,7 +92,7 @@ def get_missing_gbks(gbks: list[GBK]) -> list[GBK]:
         list[GBK]: List of GBKs that are missing from the database
     """
     # dictionary of gbk path to gbk object
-    gbk_dict = {str(gbk.hash): gbk for gbk in gbks}
+    gbk_dict = {str(gbk.path): gbk for gbk in gbks}
 
     if not DB.metadata:
         raise RuntimeError("DB.metadata is None")
@@ -107,13 +101,13 @@ def get_missing_gbks(gbks: list[GBK]) -> list[GBK]:
 
     # get set of gbks in database
     db_gbk_rows = DB.execute(gbk_table.select()).all()
-    db_gbk_hashes: set[int] = {db_gbk_row[2] for db_gbk_row in db_gbk_rows}
+    db_gbk_paths: set[int] = {db_gbk_row[1] for db_gbk_row in db_gbk_rows}
 
     missing_gbks = []
 
-    for gbk_hash in gbk_dict:
-        if gbk_hash not in db_gbk_hashes:
-            missing_gbks.append(gbk_dict[gbk_hash])
+    for gbk_path in gbk_dict:
+        if gbk_path not in db_gbk_paths:
+            missing_gbks.append(gbk_dict[gbk_path])
 
     return missing_gbks

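For context: the restored `get_input_data_state` classifies a run entirely with set operations over paths. A small illustrative sketch of those primitives, using toy paths rather than real BiG-SCAPE data:

```python
# Toy illustration of the set primitives used in get_input_data_state.
db_paths = {"a.gbk", "b.gbk", "c.gbk"}     # paths already stored in the database
input_paths = {"b.gbk", "c.gbk", "d.gbk"}  # paths supplied in the current run

# symmetric_difference: everything that is in exactly one of the two sets
sym_dif = db_paths.symmetric_difference(input_paths)
print(sym_dif)                  # {'a.gbk', 'd.gbk'}

print(db_paths == input_paths)  # False -> not SAME_DATA in the logic above
```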
3 changes: 1 addition & 2 deletions big_scape/data/schema.sql
@@ -5,12 +5,11 @@
 CREATE TABLE IF NOT EXISTS gbk (
     id INTEGER PRIMARY KEY AUTOINCREMENT,
     path TEXT,
-    hash TEXT,
     nt_seq TEXT,
     organism TEXT,
     taxonomy TEXT,
     description TEXT,
-    UNIQUE(hash)
+    UNIQUE(path)
 );
 
 CREATE TABLE IF NOT EXISTS bgc_record (
2 changes: 1 addition & 1 deletion big_scape/enums/partial_task.py
@@ -4,7 +4,7 @@
 
 
 class TASK(Enum):
-    SAVE_GBKS = 0
+    LOAD_GBKS = 0
     HMM_SCAN = 1
     HMM_ALIGN = 2
     COMPARISON = 3
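The restored check in `find_minimum_task` (`input_data_state.value < bs_enums.INPUT_TASK.SAME_DATA.value`) leans on the declaration order of the INPUT_TASK members. A hedged sketch of that idiom; the member names match the diff, but the numeric values here are assumptions, not BiG-SCAPE's actual definitions:

```python
from enum import Enum

# Assumed ordering: every state below SAME_DATA still requires loading GBKs.
class INPUT_TASK(Enum):
    NO_DATA = 0
    NEW_DATA = 1
    MIXED_DATA = 2
    SAME_DATA = 3
    PARTIAL_DATA = 4

state = INPUT_TASK.NEW_DATA
if state.value < INPUT_TASK.SAME_DATA.value:
    print("minimum task: LOAD_GBKS")  # runs for NO_DATA, NEW_DATA, MIXED_DATA
```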
30 changes: 12 additions & 18 deletions big_scape/file_input/load_files.py
@@ -290,20 +290,20 @@ def load_gbks(run: dict, bigscape_dir: Path) -> list[GBK]:
     bs_data.DB.load_from_disk(run["db_path"])
     task_state = bs_data.find_minimum_task(input_gbks)
 
-    source_dict = {gbk.hash: gbk.source_type for gbk in input_gbks}
-
-    # if we are not on the save_gbks task, we have all the data we need in the database
-    # and can just load it all into the correct python objects
-    if task_state != bs_enums.TASK.SAVE_GBKS:
+    # here we dont save anything to DB, data goes DB -> python objects
+    # if we are not on the load_gbks task, we have all the data we need
+    if task_state != bs_enums.TASK.LOAD_GBKS:
         logging.info("Loading existing run from disk...")
 
-        input_gbks_from_db = GBK.load_many(input_gbks)
-        for gbk in input_gbks_from_db:
-            gbk.source_type = source_dict[gbk.hash]
+        source_dict = {gbk.path: gbk.source_type for gbk in input_gbks}
+
+        gbks_from_db = GBK.load_all()
+        for gbk in gbks_from_db:
+            gbk.source_type = source_dict[gbk.path]
+
+        for gbk in gbks_from_db:
             bs_hmm.HSP.load_all(gbk.genes)
 
-        return input_gbks_from_db
+        return gbks_from_db
 
     # if we end up here, we are in some halfway state and need to load in the new data
     logging.info("Loading existing run from disk and adding new data...")
@@ -313,11 +313,5 @@ def load_gbks(run: dict, bigscape_dir: Path) -> list[GBK]:
     for gbk in missing_gbks:
         gbk.save_all()
 
-    # now we have all new data in the database, we can load it all into the correct
-    # python objects
-    input_gbks_from_db = GBK.load_many(input_gbks)
-    for gbk in input_gbks_from_db:
-        gbk.source_type = source_dict[gbk.hash]
-        bs_hmm.HSP.load_all(gbk.genes)
-
-    return input_gbks_from_db
+    # still return the full set
+    return input_gbks
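Since `source_type` is not persisted in the schema, the restored code rebuilds it from the command-line objects after loading, keyed by path. A minimal self-contained sketch of that re-attachment pattern (the stub class is hypothetical, not the real GBK):

```python
from dataclasses import dataclass

@dataclass
class StubGBK:
    path: str
    source_type: str = ""  # not stored in the database; re-attached after loading

cli_gbks = [StubGBK("a.gbk", "query"), StubGBK("b.gbk", "reference")]
db_gbks = [StubGBK("a.gbk"), StubGBK("b.gbk")]  # as if freshly loaded from the DB

# map path -> source_type from the CLI objects, then re-attach to the DB objects
source_dict = {gbk.path: gbk.source_type for gbk in cli_gbks}
for gbk in db_gbks:
    gbk.source_type = source_dict[gbk.path]

print([gbk.source_type for gbk in db_gbks])  # ['query', 'reference']
```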
1 change: 0 additions & 1 deletion big_scape/genbank/candidate_cluster.py
@@ -188,7 +188,6 @@ def load_all(region_dict: dict[int, Region]):
             record_table.c.product,
         )
         .where(record_table.c.record_type == "cand_cluster")
-        .where(record_table.c.parent_id.in_(region_dict.keys()))
         .compile()
     )

1 change: 0 additions & 1 deletion big_scape/genbank/cds.py
@@ -354,7 +354,6 @@ def load_all(gbk_dict: dict[int, GBK]) -> None:
             cds_table.c.aa_seq,
         )
         .order_by(cds_table.c.orf_num)
-        .where(cds_table.c.gbk_id.in_(gbk_dict.keys()))
         .compile()
     )

77 changes: 54 additions & 23 deletions big_scape/genbank/gbk.py
@@ -7,7 +7,6 @@
 # from enum import Enum
 from pathlib import Path
 from typing import Dict, Optional
-import hashlib
 
 
 # from dependencies
@@ -48,9 +47,8 @@ class GBK:
     source_type: SOURCE_TYPE
     """
 
-    def __init__(self, path, hash, source_type) -> None:
+    def __init__(self, path, source_type) -> None:
         self.path: Path = path
-        self.hash: str = hash
         self.metadata: Dict[str, str] = {}
         self.region: Optional[Region] = None
         self.nt_seq: SeqRecord.seq = None
@@ -160,7 +158,6 @@ def save(self, commit=True) -> None:
             gbk_table.insert()
             .values(
                 path=str(self.path),
-                hash=str(self.hash),
                 nt_seq=str(self.nt_seq),
                 organism=organism,
                 taxonomy=taxonomy,
@@ -217,7 +214,6 @@ def load_all() -> list[GBK]:
             gbk_table.select()
             .add_columns(
                 gbk_table.c.id,
-                gbk_table.c.hash,
                 gbk_table.c.path,
                 gbk_table.c.nt_seq,
                 gbk_table.c.organism,
@@ -231,7 +227,7 @@ def load_all() -> list[GBK]:
 
         gbk_dict = {}
         for result in cursor_result.all():
-            new_gbk = GBK(Path(result.path), result.hash, "")
+            new_gbk = GBK(Path(result.path), "")
             new_gbk._db_id = result.id
             new_gbk.nt_seq = result.nt_seq
             new_gbk.metadata["organism"] = result.organism
@@ -249,7 +245,51 @@ def load_all() -> list[GBK]:
         return list(gbk_dict.values())
 
     @staticmethod
-    def load_many(input_gbks: list[GBK]) -> list[GBK]:
+    def load_one(gbk_id: int) -> GBK:
+        """Load a single GBK object from the database
+
+        Args:
+            gbk_id (int): id of gbk to load
+
+        Returns:
+            GBK: loaded GBK object
+        """
+
+        if not DB.metadata:
+            raise RuntimeError("DB.metadata is None")
+
+        gbk_table = DB.metadata.tables["gbk"]
+        select_query = (
+            gbk_table.select()
+            .add_columns(
+                gbk_table.c.id,
+                gbk_table.c.path,
+                gbk_table.c.source_type,
+                gbk_table.c.nt_seq,
+                gbk_table.c.organism,
+                gbk_table.c.taxonomy,
+                gbk_table.c.description,
+            )
+            .where(gbk_table.c.id == gbk_id)
+            .compile()
+        )
+
+        result = DB.execute(select_query).fetchone()
+
+        if result is None:
+            raise RuntimeError(f"No GBK with id {gbk_id}")
+
+        new_gbk = GBK(Path(result.path), result.source_type)
+        new_gbk._db_id = result.id
+        new_gbk.nt_seq = result.nt_seq
+        new_gbk.metadata["organism"] = result.organism
+        new_gbk.metadata["taxonomy"] = result.taxonomy
+        new_gbk.metadata["description"] = result.description
+
+        return new_gbk
+
+    @staticmethod
+    def load_many(gbk_ids: list[int]) -> list[GBK]:
         """Load a list of GBK objects from the database
 
         Args:
@@ -259,8 +299,6 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
             list[GBK]: loaded GBK objects
         """
 
-        input_gbk_hashes = [gbk.hash for gbk in input_gbks]
-
         if not DB.metadata:
             raise RuntimeError("DB.metadata is None")
 
@@ -269,22 +307,22 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
             gbk_table.select()
             .add_columns(
                 gbk_table.c.id,
-                gbk_table.c.hash,
                 gbk_table.c.path,
+                gbk_table.c.source_type,
                 gbk_table.c.nt_seq,
                 gbk_table.c.organism,
                 gbk_table.c.taxonomy,
                 gbk_table.c.description,
             )
-            .where(gbk_table.c.hash.in_(input_gbk_hashes))
+            .where(gbk_table.c.id.in_(gbk_ids))
             .compile()
         )
 
         cursor_result = DB.execute(select_query)
 
         gbk_dict = {}
         for result in cursor_result.all():
-            new_gbk = GBK(Path(result.path), result.hash, "")
+            new_gbk = GBK(Path(result.path), result.source_type)
             new_gbk._db_id = result.id
             new_gbk.nt_seq = result.nt_seq
             new_gbk.metadata["organism"] = result.organism
@@ -342,14 +380,7 @@ def parse(
             GBK: GBK object
         """
 
-        # get unique content hash
-        f = open(path, "r")
-        data = f.read()
-        f.close()
-        data = data.encode("utf-8")  # type: ignore
-        hash = hashlib.sha256(data).hexdigest()  # type: ignore
-
-        gbk = cls(path, hash, source_type)
+        gbk = cls(path, source_type)
 
         # get record. should only ever be one for Antismash GBK
         record: SeqRecord = next(SeqIO.parse(path, "genbank"))
@@ -559,13 +590,13 @@ def __repr__(self) -> str:
         return f"GBK {self.path.name}, {len(self.genes)} genes"
 
     def __hash__(self) -> int:
-        return hash(self.hash)
+        return hash(self.path)
 
     def __eq__(self, other) -> bool:
         if not isinstance(other, GBK):
             return False
 
-        if self.hash is None or other.hash is None:
+        if self.path is None or other.path is None:
             return False
 
-        return self.hash == other.hash
+        return self.path == other.path
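For reference, the lines removed from `parse` above identified each GBK by a SHA-256 digest of its file contents. A standalone sketch of that hashing idiom (the file path is hypothetical):

```python
import hashlib
from pathlib import Path

def content_hash(path: Path) -> str:
    """Return the SHA-256 hex digest of a file's contents."""
    data = path.read_bytes()  # read raw bytes, so no encode() step is needed
    return hashlib.sha256(data).hexdigest()

# Usage (hypothetical file):
# print(content_hash(Path("example.gbk")))
```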
1 change: 0 additions & 1 deletion big_scape/genbank/proto_cluster.py
@@ -204,7 +204,6 @@ def load_all(candidate_cluster_dict: dict[int, CandidateCluster]):
             record_table.c.category,
         )
         .where(record_table.c.record_type == "protocluster")
-        .where(record_table.c.parent_id.in_(candidate_cluster_dict.keys()))
         .compile()
     )

1 change: 0 additions & 1 deletion big_scape/genbank/proto_core.py
@@ -135,7 +135,6 @@ def load_all(protocluster_dict: dict[int, ProtoCluster]):
             record_table.c.category,
         )
         .where(record_table.c.record_type == "proto_core")
-        .where(record_table.c.parent_id.in_(protocluster_dict.keys()))
         .compile()
     )

1 change: 0 additions & 1 deletion big_scape/genbank/region.py
@@ -241,7 +241,6 @@ def load_all(gbk_dict: dict[int, GBK]) -> None:
             record_table.c.product,
         )
         .where(record_table.c.record_type == "region")
-        .where(record_table.c.gbk_id.in_(gbk_dict.keys()))
         .compile()
     )

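Each of the dropped `.where(....in_(...))` clauses above restricted a `load_all` query to a known set of parent ids; the revert returns to loading every row of the table. A minimal SQLAlchemy Core sketch of the dropped filter, using an illustrative in-memory table rather than BiG-SCAPE's real schema:

```python
from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine, select

metadata = MetaData()
record_table = Table(
    "bgc_record",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("parent_id", Integer),
    Column("record_type", String),
)

engine = create_engine("sqlite:///:memory:")
metadata.create_all(engine)

parent_ids = {1, 2, 3}  # stands in for e.g. region_dict.keys() in the removed code
query = (
    select(record_table)
    .where(record_table.c.record_type == "region")
    .where(record_table.c.parent_id.in_(parent_ids))  # the clause the revert drops
)

with engine.connect() as conn:
    rows = conn.execute(query).all()  # [] here, since the toy table is empty
```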
@@ -323,7 +323,7 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n
     // construct the graph
     for (var i = 0; i < bs_data.length; i++) {
         var bs_obj = bs_data[i];
-        graph.addNode(i, { id: bs_obj["id"], hash: bs_obj["hash"], cl: bs_to_cl[i] });
+        graph.addNode(i, { id: bs_obj["id"], cl: bs_to_cl[i] });
     }
     for (var a = 0; a < bs_data.length; a++) {
         for (var b = 0; b < bs_data.length; b++) {
@@ -449,7 +449,7 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n
             .attr("stroke", "#777")
             .attr("stroke-width", link["data"]["weight"] * 10);
 
-        if (graph.getNode(link.fromId).data.hash === graph.getNode(link.toId).data.hash) {
+        if (graph.getNode(link.fromId).data.id === graph.getNode(link.toId).data.id) {
             line = line.attr("stroke-dasharray", "10,10")
         }
         return line
6 changes: 1 addition & 5 deletions big_scape/output/legacy_output.py
@@ -576,7 +576,6 @@ def generate_bs_data_js(
             "start": int,
             "end": int,
             "id": str, (e.g. AL645882.2.cluster010),
-            "hash": str,
             "mibig": bool,
             "source": str, (e.g. mibig, reference, or query),
             "record_start": int, (e.g. cds boundaries of protocluster, index starts at 1)
@@ -630,10 +629,7 @@ def generate_bs_data_js(
             "desc": organism,
             "start": 1,
             "end": len(gbk.nt_seq),
-            "id": "_".join(
-                [gbk.path.name, type(record).__name__.lower(), str(record.number)]
-            ),
-            "hash": gbk.hash,
+            "id": gbk.path.name,
             "mibig": gbk.source_type == SOURCE_TYPE.MIBIG,
             "source": gbk.source_type.name.lower(),
             "record_start": rec_start,