Commit
Merge pull request #82 from medema-group/revert-80-feature/input-subset
Revert "Feature/input subset"
adraismawur authored Nov 14, 2023
2 parents 5506728 + e2c6122 commit c33385e
Showing 25 changed files with 150 additions and 173 deletions.
36 changes: 15 additions & 21 deletions big_scape/data/partial_task.py
@@ -29,14 +29,8 @@ def find_minimum_task(gbks: list[GBK]):
"""
input_data_state = get_input_data_state(gbks)

# new data or mixed data
if (
input_data_state.value == bs_enums.INPUT_TASK.NEW_DATA.value
or input_data_state.value == bs_enums.INPUT_TASK.MIXED_DATA.value
or input_data_state.value == bs_enums.INPUT_TASK.NO_DATA.value
):
# gbks from input need to be loaded into the in-memory database
return bs_enums.TASK.SAVE_GBKS
if input_data_state.value < bs_enums.INPUT_TASK.SAME_DATA.value:
return bs_enums.TASK.LOAD_GBKS

hmm_data_state = get_hmm_data_state(gbks)

@@ -68,20 +62,20 @@ def get_input_data_state(gbks: list[GBK]) -> bs_enums.INPUT_TASK:

     # get set of gbks in database
     db_gbk_rows = DB.execute(gbk_table.select()).all()
-    db_gbk_hashes: set[str] = {db_gbk_row[2] for db_gbk_row in db_gbk_rows}
-    input_gbk_hashes: set[str] = {str(gbk.hash) for gbk in gbks}
+    db_gbk_paths: set[str] = {db_gbk_row[1] for db_gbk_row in db_gbk_rows}
+    input_gbk_paths: set[str] = {str(gbk.path) for gbk in gbks}
 
-    if db_gbk_hashes == input_gbk_hashes:
+    if db_gbk_paths == input_gbk_paths:
         return bs_enums.INPUT_TASK.SAME_DATA
 
-    union = db_gbk_hashes & input_gbk_hashes
+    sym_dif = db_gbk_paths.symmetric_difference(input_gbk_paths)
 
-    # all new data
-    if len(union) == 0:
+    # still same amount in db. new data
+    if len(sym_dif) == len(db_gbk_paths):
         return bs_enums.INPUT_TASK.NEW_DATA
 
-    # only partial data which is already in database
-    if len(union) == len(input_gbk_hashes):
+    # same amount in new data. there was more in db than in new data
+    if len(sym_dif) == len(input_gbk_paths):
         return bs_enums.INPUT_TASK.PARTIAL_DATA
 
     # otherwise there is some new data, some old data is missing
@@ -98,7 +92,7 @@ def get_missing_gbks(gbks: list[GBK]) -> list[GBK]:
         list[GBK]: List of GBKs that are missing from the database
     """
     # dictionary of gbk path to gbk object
-    gbk_dict = {str(gbk.hash): gbk for gbk in gbks}
+    gbk_dict = {str(gbk.path): gbk for gbk in gbks}
 
     if not DB.metadata:
         raise RuntimeError("DB.metadata is None")
@@ -107,13 +101,13 @@ def get_missing_gbks(gbks: list[GBK]) -> list[GBK]:

     # get set of gbks in database
     db_gbk_rows = DB.execute(gbk_table.select()).all()
-    db_gbk_hashes: set[int] = {db_gbk_row[2] for db_gbk_row in db_gbk_rows}
+    db_gbk_paths: set[int] = {db_gbk_row[1] for db_gbk_row in db_gbk_rows}
 
     missing_gbks = []
 
-    for gbk_hash in gbk_dict:
-        if gbk_hash not in db_gbk_hashes:
-            missing_gbks.append(gbk_dict[gbk_hash])
+    for gbk_path in gbk_dict:
+        if gbk_path not in db_gbk_paths:
+            missing_gbks.append(gbk_dict[gbk_path])
 
     return missing_gbks

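A note on the restored logic above: get_input_data_state() classifies the input set purely with set algebra over paths. A minimal standalone sketch of the intended classification, written with explicit subset/disjoint tests instead of the symmetric-difference length comparisons the real code uses (names are illustrative, not the BiG-SCAPE API):

def classify_input(db_paths: set[str], input_paths: set[str]) -> str:
    """Classify how the input GBK paths relate to what the database holds."""
    if db_paths == input_paths:
        return "SAME_DATA"  # nothing new to save
    if db_paths.isdisjoint(input_paths):
        return "NEW_DATA"  # every input path is unseen
    if input_paths < db_paths:
        return "PARTIAL_DATA"  # input is a strict subset of stored data
    return "MIXED_DATA"  # some paths are new, some already stored

# one stored path reappears, one input path is new -> mixed
assert classify_input({"a.gbk", "b.gbk"}, {"b.gbk", "c.gbk"}) == "MIXED_DATA"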
3 changes: 1 addition & 2 deletions big_scape/data/schema.sql
@@ -5,12 +5,11 @@
 CREATE TABLE IF NOT EXISTS gbk (
     id INTEGER PRIMARY KEY AUTOINCREMENT,
     path TEXT,
-    hash TEXT,
     nt_seq TEXT,
     organism TEXT,
     taxonomy TEXT,
     description TEXT,
-    UNIQUE(hash)
+    UNIQUE(path)
 );
 
 CREATE TABLE IF NOT EXISTS bgc_record (
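The visible effect of swapping UNIQUE(hash) back to UNIQUE(path): inserting the same path twice is rejected, while identical file content stored under two different paths is accepted again. A small self-contained sqlite3 sketch (reduced table, not the full schema above):

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE gbk (id INTEGER PRIMARY KEY AUTOINCREMENT, path TEXT, UNIQUE(path))")
con.execute("INSERT INTO gbk (path) VALUES ('run1/x.gbk')")
con.execute("INSERT INTO gbk (path) VALUES ('run2/x.gbk')")  # same content elsewhere: accepted
try:
    con.execute("INSERT INTO gbk (path) VALUES ('run1/x.gbk')")
except sqlite3.IntegrityError:
    pass  # duplicate path: rejected by UNIQUE(path)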
2 changes: 1 addition & 1 deletion big_scape/enums/partial_task.py
@@ -4,7 +4,7 @@


 class TASK(Enum):
-    SAVE_GBKS = 0
+    LOAD_GBKS = 0
     HMM_SCAN = 1
     HMM_ALIGN = 2
     COMPARISON = 3
30 changes: 12 additions & 18 deletions big_scape/file_input/load_files.py
@@ -290,20 +290,20 @@ def load_gbks(run: dict, bigscape_dir: Path) -> list[GBK]:
     bs_data.DB.load_from_disk(run["db_path"])
     task_state = bs_data.find_minimum_task(input_gbks)
 
-    source_dict = {gbk.hash: gbk.source_type for gbk in input_gbks}
-
-    # if we are are not on the save_gbks task, we have all the data we need in the database
-    # and can just load it all into the correct python objects
-    if task_state != bs_enums.TASK.SAVE_GBKS:
+    # here we dont save anything to DB, data goes DB -> python objects
+    # if we are are not on the load_gbks task, we have all the data we need
+    if task_state != bs_enums.TASK.LOAD_GBKS:
         logging.info("Loading existing run from disk...")
 
-        input_gbks_from_db = GBK.load_many(input_gbks)
-        for gbk in input_gbks_from_db:
-            gbk.source_type = source_dict[gbk.hash]
+        source_dict = {gbk.path: gbk.source_type for gbk in input_gbks}
+
+        gbks_from_db = GBK.load_all()
+        for gbk in gbks_from_db:
+            gbk.source_type = source_dict[gbk.path]
+
+        for gbk in gbks_from_db:
             bs_hmm.HSP.load_all(gbk.genes)
 
-        return input_gbks_from_db
+        return gbks_from_db

     # if we end up here, we are in some halfway state and need to load in the new data
     logging.info("Loading existing run from disk and adding new data...")

@@ -313,11 +313,5 @@ def load_gbks(run: dict, bigscape_dir: Path) -> list[GBK]:
     for gbk in missing_gbks:
         gbk.save_all()
 
-    # now we have all new data in the database, we can load it all in to the correct
-    # python objects
-    input_gbks_from_db = GBK.load_many(input_gbks)
-    for gbk in input_gbks_from_db:
-        gbk.source_type = source_dict[gbk.hash]
-        bs_hmm.HSP.load_all(gbk.genes)
-
-    return input_gbks_from_db
+    # still return the full set
+    return input_gbks
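Worth noting in the restored branch above: GBK.load_all() hydrates objects from the database without a source type (it constructs them with an empty one, as seen in the gbk.py diff below), so the per-run source_type is re-attached from the parsed inputs, keyed by path. A reduced sketch of that idiom with a hypothetical stand-in class, not the real GBK:

from pathlib import Path
from typing import Optional


class Record:
    """Stand-in for GBK: path is persisted, source_type is per-run."""

    def __init__(self, path: Path, source_type: Optional[str] = None) -> None:
        self.path = path
        self.source_type = source_type


# the parsed inputs know their role in this run
parsed = [Record(Path("a.gbk"), "query"), Record(Path("b.gbk"), "reference")]
source_dict = {r.path: r.source_type for r in parsed}

# objects hydrated from the database come back without a source type
loaded = [Record(Path("a.gbk")), Record(Path("b.gbk"))]
for r in loaded:
    r.source_type = source_dict[r.path]

assert [r.source_type for r in loaded] == ["query", "reference"]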
1 change: 0 additions & 1 deletion big_scape/genbank/candidate_cluster.py
@@ -188,7 +188,6 @@ def load_all(region_dict: dict[int, Region]):
             record_table.c.product,
         )
         .where(record_table.c.record_type == "cand_cluster")
-        .where(record_table.c.parent_id.in_(region_dict.keys()))
         .compile()
     )

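The one-line deletion above repeats in cds.py, proto_cluster.py, proto_core.py, and region.py below: dropping the .where(... .in_(...)) clause means each load_all() selects every row of its table again, rather than only the rows belonging to the requested parents. A standalone SQLAlchemy sketch of the filter being removed (toy table, not the real schema):

from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine, select

metadata = MetaData()
record = Table(
    "bgc_record",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("parent_id", Integer),
    Column("record_type", String),
)
engine = create_engine("sqlite://")
metadata.create_all(engine)

# subset variant (the code being reverted): restrict to the given parents
subset_stmt = (
    select(record.c.id)
    .where(record.c.record_type == "cand_cluster")
    .where(record.c.parent_id.in_([1, 2, 3]))
)

# reverted variant: without the parent filter, all cand_cluster rows match
full_stmt = select(record.c.id).where(record.c.record_type == "cand_cluster")

with engine.connect() as conn:
    print(conn.execute(subset_stmt).all(), conn.execute(full_stmt).all())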
1 change: 0 additions & 1 deletion big_scape/genbank/cds.py
@@ -354,7 +354,6 @@ def load_all(gbk_dict: dict[int, GBK]) -> None:
             cds_table.c.aa_seq,
         )
         .order_by(cds_table.c.orf_num)
-        .where(cds_table.c.gbk_id.in_(gbk_dict.keys()))
         .compile()
     )

77 changes: 54 additions & 23 deletions big_scape/genbank/gbk.py
@@ -7,7 +7,6 @@
 # from enum import Enum
 from pathlib import Path
 from typing import Dict, Optional
-import hashlib
 
 
 # from dependencies
@@ -48,9 +47,8 @@ class GBK:
     source_type: SOURCE_TYPE
     """
 
-    def __init__(self, path, hash, source_type) -> None:
+    def __init__(self, path, source_type) -> None:
         self.path: Path = path
-        self.hash: str = hash
         self.metadata: Dict[str, str] = {}
         self.region: Optional[Region] = None
         self.nt_seq: SeqRecord.seq = None
@@ -160,7 +158,6 @@ def save(self, commit=True) -> None:
             gbk_table.insert()
             .values(
                 path=str(self.path),
-                hash=str(self.hash),
                 nt_seq=str(self.nt_seq),
                 organism=organism,
                 taxonomy=taxonomy,
@@ -217,7 +214,6 @@ def load_all() -> list[GBK]:
             gbk_table.select()
             .add_columns(
                 gbk_table.c.id,
-                gbk_table.c.hash,
                 gbk_table.c.path,
                 gbk_table.c.nt_seq,
                 gbk_table.c.organism,
@@ -231,7 +227,7 @@

         gbk_dict = {}
         for result in cursor_result.all():
-            new_gbk = GBK(Path(result.path), result.hash, "")
+            new_gbk = GBK(Path(result.path), "")
             new_gbk._db_id = result.id
             new_gbk.nt_seq = result.nt_seq
             new_gbk.metadata["organism"] = result.organism
@@ -249,7 +245,51 @@ def load_all() -> list[GBK]:
         return list(gbk_dict.values())
 
     @staticmethod
-    def load_many(input_gbks: list[GBK]) -> list[GBK]:
+    def load_one(gbk_id: int) -> GBK:
+        """Load a single GBK object from the database
+
+        Args:
+            gbk_id (int): id of gbk to load
+
+        Returns:
+            GBK: loaded GBK object
+        """
+
+        if not DB.metadata:
+            raise RuntimeError("DB.metadata is None")
+
+        gbk_table = DB.metadata.tables["gbk"]
+        select_query = (
+            gbk_table.select()
+            .add_columns(
+                gbk_table.c.id,
+                gbk_table.c.path,
+                gbk_table.c.source_type,
+                gbk_table.c.nt_seq,
+                gbk_table.c.organism,
+                gbk_table.c.taxonomy,
+                gbk_table.c.description,
+            )
+            .where(gbk_table.c.id == gbk_id)
+            .compile()
+        )
+
+        result = DB.execute(select_query).fetchone()
+
+        if result is None:
+            raise RuntimeError(f"No GBK with id {gbk_id}")
+
+        new_gbk = GBK(Path(result.path), result.source_type)
+        new_gbk._db_id = result.id
+        new_gbk.nt_seq = result.nt_seq
+        new_gbk.metadata["organism"] = result.organism
+        new_gbk.metadata["taxonomy"] = result.taxonomy
+        new_gbk.metadata["description"] = result.description
+
+        return new_gbk
+
+    @staticmethod
+    def load_many(gbk_ids: list[int]) -> list[GBK]:
"""Load a list of GBK objects from the database
Args:
Expand All @@ -259,8 +299,6 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
             list[GBK]: loaded GBK objects
         """
 
-        input_gbk_hashes = [gbk.hash for gbk in input_gbks]
-
         if not DB.metadata:
             raise RuntimeError("DB.metadata is None")
 
@@ -269,22 +307,22 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
             gbk_table.select()
             .add_columns(
                 gbk_table.c.id,
-                gbk_table.c.hash,
                 gbk_table.c.path,
+                gbk_table.c.source_type,
                 gbk_table.c.nt_seq,
                 gbk_table.c.organism,
                 gbk_table.c.taxonomy,
                 gbk_table.c.description,
             )
-            .where(gbk_table.c.hash.in_(input_gbk_hashes))
+            .where(gbk_table.c.id.in_(gbk_ids))
             .compile()
         )
 
         cursor_result = DB.execute(select_query)
 
         gbk_dict = {}
         for result in cursor_result.all():
-            new_gbk = GBK(Path(result.path), result.hash, "")
+            new_gbk = GBK(Path(result.path), result.source_type)
             new_gbk._db_id = result.id
             new_gbk.nt_seq = result.nt_seq
             new_gbk.metadata["organism"] = result.organism
@@ -342,14 +380,7 @@ def parse(
         GBK: GBK object
         """
 
-        # get unique content hash
-        f = open(path, "r")
-        data = f.read()
-        f.close()
-        data = data.encode("utf-8")  # type: ignore
-        hash = hashlib.sha256(data).hexdigest()  # type: ignore
-
-        gbk = cls(path, hash, source_type)
+        gbk = cls(path, source_type)
 
         # get record. should only ever be one for Antismash GBK
         record: SeqRecord = next(SeqIO.parse(path, "genbank"))
@@ -559,13 +590,13 @@ def __repr__(self) -> str:
return f"GBK {self.path.name}, {len(self.genes)} genes"

def __hash__(self) -> int:
return hash(self.hash)
return hash(self.path)

def __eq__(self, other) -> bool:
if not isinstance(other, GBK):
return False

if self.hash is None or other.hash is None:
if self.path is None or other.path is None:
return False

return self.hash == other.hash
return self.path == other.path
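The __hash__/__eq__ hunk above is the crux of the revert: keyed on path, the same file at two locations is treated as two distinct inputs, whereas the content-hash scheme being removed collapsed them into one. A self-contained sketch of the two identity notions (toy class, not the real GBK):

import hashlib
from pathlib import Path


class ByPath:
    """Identity keyed on the file path (the behaviour this commit restores)."""

    def __init__(self, path: Path) -> None:
        self.path = path

    def __hash__(self) -> int:
        return hash(self.path)

    def __eq__(self, other) -> bool:
        return isinstance(other, ByPath) and self.path == other.path


# identical content at two locations counts as two distinct inputs
assert len({ByPath(Path("run1/x.gbk")), ByPath(Path("run2/x.gbk"))}) == 2

# the reverted feature keyed identity on a sha256 of the file contents,
# under which the two copies above would have collapsed into one
assert hashlib.sha256(b"LOCUS ...").hexdigest() == hashlib.sha256(b"LOCUS ...").hexdigest()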
1 change: 0 additions & 1 deletion big_scape/genbank/proto_cluster.py
@@ -204,7 +204,6 @@ def load_all(candidate_cluster_dict: dict[int, CandidateCluster]):
             record_table.c.category,
         )
         .where(record_table.c.record_type == "protocluster")
-        .where(record_table.c.parent_id.in_(candidate_cluster_dict.keys()))
         .compile()
     )

1 change: 0 additions & 1 deletion big_scape/genbank/proto_core.py
@@ -135,7 +135,6 @@ def load_all(protocluster_dict: dict[int, ProtoCluster]):
             record_table.c.category,
         )
         .where(record_table.c.record_type == "proto_core")
-        .where(record_table.c.parent_id.in_(protocluster_dict.keys()))
         .compile()
     )

1 change: 0 additions & 1 deletion big_scape/genbank/region.py
@@ -241,7 +241,6 @@ def load_all(gbk_dict: dict[int, GBK]) -> None:
             record_table.c.product,
         )
         .where(record_table.c.record_type == "region")
-        .where(record_table.c.gbk_id.in_(gbk_dict.keys()))
         .compile()
     )

(file header not captured in this view; the hunks below are from the JavaScript visualization code in the HTML output template)

@@ -323,7 +323,7 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n
     // construct the graph
     for (var i = 0; i < bs_data.length; i++) {
         var bs_obj = bs_data[i];
-        graph.addNode(i, { id: bs_obj["id"], hash: bs_obj["hash"], cl: bs_to_cl[i] });
+        graph.addNode(i, { id: bs_obj["id"], cl: bs_to_cl[i] });
     }
     for (var a = 0; a < bs_data.length; a++) {
         for (var b = 0; b < bs_data.length; b++) {
@@ -449,7 +449,7 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n
.attr("stroke", "#777")
.attr("stroke-width", link["data"]["weight"] * 10);

if (graph.getNode(link.fromId).data.hash === graph.getNode(link.toId).data.hash) {
if (graph.getNode(link.fromId).data.id === graph.getNode(link.toId).data.id) {
line = line.attr("stroke-dasharray", "10,10")
}
return line
6 changes: 1 addition & 5 deletions big_scape/output/legacy_output.py
@@ -576,7 +576,6 @@ def generate_bs_data_js(
"start: int,
"end": int,
"id": str, (e.g. AL645882.2.cluster010),
"hash": str,
"mibig": bool,
"source": str, (e.g. mibig, reference, or query),
"record_start": int, (e.g. cds boundaries of protocluster, index starts at 1)
@@ -630,10 +629,7 @@ def generate_bs_data_js(
"desc": organism,
"start": 1,
"end": len(gbk.nt_seq),
"id": "_".join(
[gbk.path.name, type(record).__name__.lower(), str(record.number)]
),
"hash": gbk.hash,
"id": gbk.path.name,
"mibig": gbk.source_type == SOURCE_TYPE.MIBIG,
"source": gbk.source_type.name.lower(),
"record_start": rec_start,
(diffs for the remaining changed files not loaded in this view)