Revert "Feature/input subset" #82

Merged

merged 1 commit into from Nov 14, 2023
36 changes: 15 additions & 21 deletions big_scape/data/partial_task.py
@@ -29,14 +29,8 @@ def find_minimum_task(gbks: list[GBK]):
     """
     input_data_state = get_input_data_state(gbks)
 
-    # new data or mixed data
-    if (
-        input_data_state.value == bs_enums.INPUT_TASK.NEW_DATA.value
-        or input_data_state.value == bs_enums.INPUT_TASK.MIXED_DATA.value
-        or input_data_state.value == bs_enums.INPUT_TASK.NO_DATA.value
-    ):
-        # gbks from input need to be loaded into the in-memory database
-        return bs_enums.TASK.SAVE_GBKS
+    if input_data_state.value < bs_enums.INPUT_TASK.SAME_DATA.value:
+        return bs_enums.TASK.LOAD_GBKS
 
     hmm_data_state = get_hmm_data_state(gbks)
 
@@ -68,20 +62,20 @@ def get_input_data_state(gbks: list[GBK]) -> bs_enums.INPUT_TASK:
 
     # get set of gbks in database
     db_gbk_rows = DB.execute(gbk_table.select()).all()
-    db_gbk_hashes: set[str] = {db_gbk_row[2] for db_gbk_row in db_gbk_rows}
-    input_gbk_hashes: set[str] = {str(gbk.hash) for gbk in gbks}
+    db_gbk_paths: set[str] = {db_gbk_row[1] for db_gbk_row in db_gbk_rows}
+    input_gbk_paths: set[str] = {str(gbk.path) for gbk in gbks}
 
-    if db_gbk_hashes == input_gbk_hashes:
+    if db_gbk_paths == input_gbk_paths:
         return bs_enums.INPUT_TASK.SAME_DATA
 
-    union = db_gbk_hashes & input_gbk_hashes
+    sym_dif = db_gbk_paths.symmetric_difference(input_gbk_paths)
 
-    # all new data
-    if len(union) == 0:
+    # still same amount in db. new data
+    if len(sym_dif) == len(db_gbk_paths):
         return bs_enums.INPUT_TASK.NEW_DATA
 
-    # only partial data which is already in database
-    if len(union) == len(input_gbk_hashes):
+    # same amount in new data. there was more in db than in new data
+    if len(sym_dif) == len(input_gbk_paths):
         return bs_enums.INPUT_TASK.PARTIAL_DATA
 
     # otherwise there is some new data, some old data is missing
@@ -98,7 +92,7 @@ def get_missing_gbks(gbks: list[GBK]) -> list[GBK]:
         list[GBK]: List of GBKs that are missing from the database
     """
     # dictionary of gbk path to gbk object
-    gbk_dict = {str(gbk.hash): gbk for gbk in gbks}
+    gbk_dict = {str(gbk.path): gbk for gbk in gbks}
 
     if not DB.metadata:
         raise RuntimeError("DB.metadata is None")
@@ -107,13 +101,13 @@ def get_missing_gbks(gbks: list[GBK]) -> list[GBK]:
 
     # get set of gbks in database
     db_gbk_rows = DB.execute(gbk_table.select()).all()
-    db_gbk_hashes: set[int] = {db_gbk_row[2] for db_gbk_row in db_gbk_rows}
+    db_gbk_paths: set[int] = {db_gbk_row[1] for db_gbk_row in db_gbk_rows}
 
     missing_gbks = []
 
-    for gbk_hash in gbk_dict:
-        if gbk_hash not in db_gbk_hashes:
-            missing_gbks.append(gbk_dict[gbk_hash])
+    for gbk_path in gbk_dict:
+        if gbk_path not in db_gbk_paths:
+            missing_gbks.append(gbk_dict[gbk_path])
 
     return missing_gbks

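For context: the restored `get_input_data_state` classifies a run entirely with set operations over paths. A small illustrative sketch of those primitives, using toy paths rather than real BiG-SCAPE data:

```python
# Toy illustration of the set primitives used in get_input_data_state.
db_paths = {"a.gbk", "b.gbk", "c.gbk"}     # paths already stored in the database
input_paths = {"b.gbk", "c.gbk", "d.gbk"}  # paths supplied in the current run

# symmetric_difference: everything that is in exactly one of the two sets
sym_dif = db_paths.symmetric_difference(input_paths)
print(sym_dif)                  # {'a.gbk', 'd.gbk'}

print(db_paths == input_paths)  # False -> not SAME_DATA in the logic above
```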
3 changes: 1 addition & 2 deletions big_scape/data/schema.sql
@@ -5,12 +5,11 @@
 CREATE TABLE IF NOT EXISTS gbk (
     id INTEGER PRIMARY KEY AUTOINCREMENT,
     path TEXT,
-    hash TEXT,
     nt_seq TEXT,
     organism TEXT,
     taxonomy TEXT,
     description TEXT,
-    UNIQUE(hash)
+    UNIQUE(path)
 );
 
 CREATE TABLE IF NOT EXISTS bgc_record (
2 changes: 1 addition & 1 deletion big_scape/enums/partial_task.py
@@ -4,7 +4,7 @@
 
 
 class TASK(Enum):
-    SAVE_GBKS = 0
+    LOAD_GBKS = 0
     HMM_SCAN = 1
     HMM_ALIGN = 2
     COMPARISON = 3
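The restored check in `find_minimum_task` (`input_data_state.value < bs_enums.INPUT_TASK.SAME_DATA.value`) leans on the declaration order of the INPUT_TASK members. A hedged sketch of that idiom; the member names match the diff, but the numeric values here are assumptions, not BiG-SCAPE's actual definitions:

```python
from enum import Enum

# Assumed ordering: every state below SAME_DATA still requires loading GBKs.
class INPUT_TASK(Enum):
    NO_DATA = 0
    NEW_DATA = 1
    MIXED_DATA = 2
    SAME_DATA = 3
    PARTIAL_DATA = 4

state = INPUT_TASK.NEW_DATA
if state.value < INPUT_TASK.SAME_DATA.value:
    print("minimum task: LOAD_GBKS")  # runs for NO_DATA, NEW_DATA, MIXED_DATA
```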
30 changes: 12 additions & 18 deletions big_scape/file_input/load_files.py
@@ -290,20 +290,20 @@ def load_gbks(run: dict, bigscape_dir: Path) -> list[GBK]:
     bs_data.DB.load_from_disk(run["db_path"])
     task_state = bs_data.find_minimum_task(input_gbks)
 
-    source_dict = {gbk.hash: gbk.source_type for gbk in input_gbks}
-
-    # if we are not on the save_gbks task, we have all the data we need in the database
-    # and can just load it all into the correct python objects
-    if task_state != bs_enums.TASK.SAVE_GBKS:
+    # here we dont save anything to DB, data goes DB -> python objects
+    # if we are not on the load_gbks task, we have all the data we need
+    if task_state != bs_enums.TASK.LOAD_GBKS:
         logging.info("Loading existing run from disk...")
 
-        input_gbks_from_db = GBK.load_many(input_gbks)
-        for gbk in input_gbks_from_db:
-            gbk.source_type = source_dict[gbk.hash]
+        source_dict = {gbk.path: gbk.source_type for gbk in input_gbks}
+
+        gbks_from_db = GBK.load_all()
+        for gbk in gbks_from_db:
+            gbk.source_type = source_dict[gbk.path]
+
+        for gbk in gbks_from_db:
             bs_hmm.HSP.load_all(gbk.genes)
 
-        return input_gbks_from_db
+        return gbks_from_db
 
     # if we end up here, we are in some halfway state and need to load in the new data
     logging.info("Loading existing run from disk and adding new data...")
@@ -313,11 +313,5 @@ def load_gbks(run: dict, bigscape_dir: Path) -> list[GBK]:
     for gbk in missing_gbks:
         gbk.save_all()
 
-    # now we have all new data in the database, we can load it all into the correct
-    # python objects
-    input_gbks_from_db = GBK.load_many(input_gbks)
-    for gbk in input_gbks_from_db:
-        gbk.source_type = source_dict[gbk.hash]
-        bs_hmm.HSP.load_all(gbk.genes)
-
-    return input_gbks_from_db
+    # still return the full set
+    return input_gbks
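Since `source_type` is not persisted in the schema, the restored code rebuilds it from the command-line objects after loading, keyed by path. A minimal self-contained sketch of that re-attachment pattern (the stub class is hypothetical, not the real GBK):

```python
from dataclasses import dataclass

@dataclass
class StubGBK:
    path: str
    source_type: str = ""  # not stored in the database; re-attached after loading

cli_gbks = [StubGBK("a.gbk", "query"), StubGBK("b.gbk", "reference")]
db_gbks = [StubGBK("a.gbk"), StubGBK("b.gbk")]  # as if freshly loaded from the DB

# map path -> source_type from the CLI objects, then re-attach to the DB objects
source_dict = {gbk.path: gbk.source_type for gbk in cli_gbks}
for gbk in db_gbks:
    gbk.source_type = source_dict[gbk.path]

print([gbk.source_type for gbk in db_gbks])  # ['query', 'reference']
```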
1 change: 0 additions & 1 deletion big_scape/genbank/candidate_cluster.py
@@ -188,7 +188,6 @@ def load_all(region_dict: dict[int, Region]):
             record_table.c.product,
         )
         .where(record_table.c.record_type == "cand_cluster")
-        .where(record_table.c.parent_id.in_(region_dict.keys()))
         .compile()
     )

1 change: 0 additions & 1 deletion big_scape/genbank/cds.py
@@ -354,7 +354,6 @@ def load_all(gbk_dict: dict[int, GBK]) -> None:
             cds_table.c.aa_seq,
         )
         .order_by(cds_table.c.orf_num)
-        .where(cds_table.c.gbk_id.in_(gbk_dict.keys()))
         .compile()
     )

77 changes: 54 additions & 23 deletions big_scape/genbank/gbk.py
@@ -7,7 +7,6 @@
 # from enum import Enum
 from pathlib import Path
 from typing import Dict, Optional
-import hashlib
 
 
 # from dependencies
@@ -48,9 +47,8 @@ class GBK:
     source_type: SOURCE_TYPE
     """
 
-    def __init__(self, path, hash, source_type) -> None:
+    def __init__(self, path, source_type) -> None:
         self.path: Path = path
-        self.hash: str = hash
         self.metadata: Dict[str, str] = {}
         self.region: Optional[Region] = None
         self.nt_seq: SeqRecord.seq = None
@@ -160,7 +158,6 @@ def save(self, commit=True) -> None:
             gbk_table.insert()
             .values(
                 path=str(self.path),
-                hash=str(self.hash),
                 nt_seq=str(self.nt_seq),
                 organism=organism,
                 taxonomy=taxonomy,
@@ -217,7 +214,6 @@ def load_all() -> list[GBK]:
             gbk_table.select()
             .add_columns(
                 gbk_table.c.id,
-                gbk_table.c.hash,
                 gbk_table.c.path,
                 gbk_table.c.nt_seq,
                 gbk_table.c.organism,
@@ -231,7 +227,7 @@ def load_all() -> list[GBK]:
 
         gbk_dict = {}
         for result in cursor_result.all():
-            new_gbk = GBK(Path(result.path), result.hash, "")
+            new_gbk = GBK(Path(result.path), "")
             new_gbk._db_id = result.id
             new_gbk.nt_seq = result.nt_seq
             new_gbk.metadata["organism"] = result.organism
@@ -249,7 +245,51 @@ def load_all() -> list[GBK]:
         return list(gbk_dict.values())
 
     @staticmethod
-    def load_many(input_gbks: list[GBK]) -> list[GBK]:
+    def load_one(gbk_id: int) -> GBK:
+        """Load a single GBK object from the database
+
+        Args:
+            gbk_id (int): id of gbk to load
+
+        Returns:
+            GBK: loaded GBK object
+        """
+
+        if not DB.metadata:
+            raise RuntimeError("DB.metadata is None")
+
+        gbk_table = DB.metadata.tables["gbk"]
+        select_query = (
+            gbk_table.select()
+            .add_columns(
+                gbk_table.c.id,
+                gbk_table.c.path,
+                gbk_table.c.source_type,
+                gbk_table.c.nt_seq,
+                gbk_table.c.organism,
+                gbk_table.c.taxonomy,
+                gbk_table.c.description,
+            )
+            .where(gbk_table.c.id == gbk_id)
+            .compile()
+        )
+
+        result = DB.execute(select_query).fetchone()
+
+        if result is None:
+            raise RuntimeError(f"No GBK with id {gbk_id}")
+
+        new_gbk = GBK(Path(result.path), result.source_type)
+        new_gbk._db_id = result.id
+        new_gbk.nt_seq = result.nt_seq
+        new_gbk.metadata["organism"] = result.organism
+        new_gbk.metadata["taxonomy"] = result.taxonomy
+        new_gbk.metadata["description"] = result.description
+
+        return new_gbk
+
+    @staticmethod
+    def load_many(gbk_ids: list[int]) -> list[GBK]:
         """Load a list of GBK objects from the database
 
         Args:
@@ -259,8 +299,6 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
             list[GBK]: loaded GBK objects
         """
 
-        input_gbk_hashes = [gbk.hash for gbk in input_gbks]
-
         if not DB.metadata:
             raise RuntimeError("DB.metadata is None")
 
@@ -269,22 +307,22 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
             gbk_table.select()
             .add_columns(
                 gbk_table.c.id,
-                gbk_table.c.hash,
                 gbk_table.c.path,
+                gbk_table.c.source_type,
                 gbk_table.c.nt_seq,
                 gbk_table.c.organism,
                 gbk_table.c.taxonomy,
                 gbk_table.c.description,
             )
-            .where(gbk_table.c.hash.in_(input_gbk_hashes))
+            .where(gbk_table.c.id.in_(gbk_ids))
             .compile()
         )
 
         cursor_result = DB.execute(select_query)
 
         gbk_dict = {}
         for result in cursor_result.all():
-            new_gbk = GBK(Path(result.path), result.hash, "")
+            new_gbk = GBK(Path(result.path), result.source_type)
             new_gbk._db_id = result.id
             new_gbk.nt_seq = result.nt_seq
             new_gbk.metadata["organism"] = result.organism
@@ -342,14 +380,7 @@ def parse(
             GBK: GBK object
         """
 
-        # get unique content hash
-        f = open(path, "r")
-        data = f.read()
-        f.close()
-        data = data.encode("utf-8")  # type: ignore
-        hash = hashlib.sha256(data).hexdigest()  # type: ignore
-
-        gbk = cls(path, hash, source_type)
+        gbk = cls(path, source_type)
 
         # get record. should only ever be one for Antismash GBK
         record: SeqRecord = next(SeqIO.parse(path, "genbank"))
@@ -559,13 +590,13 @@ def __repr__(self) -> str:
         return f"GBK {self.path.name}, {len(self.genes)} genes"
 
     def __hash__(self) -> int:
-        return hash(self.hash)
+        return hash(self.path)
 
     def __eq__(self, other) -> bool:
         if not isinstance(other, GBK):
             return False
 
-        if self.hash is None or other.hash is None:
+        if self.path is None or other.path is None:
             return False
 
-        return self.hash == other.hash
+        return self.path == other.path
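For reference, the lines removed from `parse` above identified each GBK by a SHA-256 digest of its file contents. A standalone sketch of that hashing idiom (the file path is hypothetical):

```python
import hashlib
from pathlib import Path

def content_hash(path: Path) -> str:
    """Return the SHA-256 hex digest of a file's contents."""
    data = path.read_bytes()  # read raw bytes, so no encode() step is needed
    return hashlib.sha256(data).hexdigest()

# Usage (hypothetical file):
# print(content_hash(Path("example.gbk")))
```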
1 change: 0 additions & 1 deletion big_scape/genbank/proto_cluster.py
@@ -204,7 +204,6 @@ def load_all(candidate_cluster_dict: dict[int, CandidateCluster]):
             record_table.c.category,
         )
         .where(record_table.c.record_type == "protocluster")
-        .where(record_table.c.parent_id.in_(candidate_cluster_dict.keys()))
         .compile()
     )

1 change: 0 additions & 1 deletion big_scape/genbank/proto_core.py
@@ -135,7 +135,6 @@ def load_all(protocluster_dict: dict[int, ProtoCluster]):
             record_table.c.category,
         )
         .where(record_table.c.record_type == "proto_core")
-        .where(record_table.c.parent_id.in_(protocluster_dict.keys()))
         .compile()
     )

1 change: 0 additions & 1 deletion big_scape/genbank/region.py
@@ -241,7 +241,6 @@ def load_all(gbk_dict: dict[int, GBK]) -> None:
             record_table.c.product,
         )
         .where(record_table.c.record_type == "region")
-        .where(record_table.c.gbk_id.in_(gbk_dict.keys()))
         .compile()
     )

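Each of the dropped `.where(....in_(...))` clauses above restricted a `load_all` query to a known set of parent ids; the revert returns to loading every row of the table. A minimal SQLAlchemy Core sketch of the dropped filter, using an illustrative in-memory table rather than BiG-SCAPE's real schema:

```python
from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine, select

metadata = MetaData()
record_table = Table(
    "bgc_record",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("parent_id", Integer),
    Column("record_type", String),
)

engine = create_engine("sqlite:///:memory:")
metadata.create_all(engine)

parent_ids = {1, 2, 3}  # stands in for e.g. region_dict.keys() in the removed code
query = (
    select(record_table)
    .where(record_table.c.record_type == "region")
    .where(record_table.c.parent_id.in_(parent_ids))  # the clause the revert drops
)

with engine.connect() as conn:
    rows = conn.execute(query).all()  # [] here, since the toy table is empty
```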
@@ -323,7 +323,7 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n
     // construct the graph
     for (var i = 0; i < bs_data.length; i++) {
         var bs_obj = bs_data[i];
-        graph.addNode(i, { id: bs_obj["id"], hash: bs_obj["hash"], cl: bs_to_cl[i] });
+        graph.addNode(i, { id: bs_obj["id"], cl: bs_to_cl[i] });
     }
     for (var a = 0; a < bs_data.length; a++) {
         for (var b = 0; b < bs_data.length; b++) {
@@ -449,7 +449,7 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n
             .attr("stroke", "#777")
             .attr("stroke-width", link["data"]["weight"] * 10);
 
-        if (graph.getNode(link.fromId).data.hash === graph.getNode(link.toId).data.hash) {
+        if (graph.getNode(link.fromId).data.id === graph.getNode(link.toId).data.id) {
             line = line.attr("stroke-dasharray", "10,10")
         }
         return line
6 changes: 1 addition & 5 deletions big_scape/output/legacy_output.py
@@ -576,7 +576,6 @@ def generate_bs_data_js(
             "start": int,
             "end": int,
             "id": str, (e.g. AL645882.2.cluster010),
-            "hash": str,
             "mibig": bool,
             "source": str, (e.g. mibig, reference, or query),
             "record_start": int, (e.g. cds boundaries of protocluster, index starts at 1)
@@ -630,10 +629,7 @@ def generate_bs_data_js(
             "desc": organism,
             "start": 1,
             "end": len(gbk.nt_seq),
-            "id": "_".join(
-                [gbk.path.name, type(record).__name__.lower(), str(record.number)]
-            ),
-            "hash": gbk.hash,
+            "id": gbk.path.name,
             "mibig": gbk.source_type == SOURCE_TYPE.MIBIG,
             "source": gbk.source_type.name.lower(),
             "record_start": rec_start,