Skip to content

Commit

Permalink
fix result generation crash on multiple cutoffs
Browse files Browse the repository at this point in the history
  • Loading branch information
adraismawur committed Apr 19, 2024
1 parent af9db71 commit 66f8820
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 47 deletions.
115 changes: 71 additions & 44 deletions big_scape/output/legacy_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,7 @@ def generate_run_data_js(
"classify": (
"Legacy Groups"
if run["legacy_classify"]
else run["classify"].name.title()
if run["classify"]
else "Not Classify"
else run["classify"].name.title() if run["classify"] else "Not Classify"
),
"weights": "Legacy Weights" if run["legacy_weights"] else "Mix",
"alignment_mode": run["alignment_mode"].name.title(),
Expand Down Expand Up @@ -873,9 +871,9 @@ def adjust_lcs_to_full_region(


def generate_bs_family_alignment(
family_db_id: int,
family_members: list[int],
records: list[BGCRecord],
exemplar_records_list_id: int,
family_member_records_list_ids: list[int],
records_list: list[BGCRecord],
edge_param_id: int,
tree_path: Path,
):
Expand All @@ -891,18 +889,22 @@ def generate_bs_family_alignment(
Returns:
dict[str, Any]: alignment of family members
"""
family_name = "FAM_{:05d}".format(family_db_id)
family_name = "FAM_{:05d}".format(exemplar_records_list_id)
logging.debug("Generating alignment for %s", family_name)
# collects records within this GCF
family_records = [records[bgc_num] for bgc_num in family_members]
family_member_db_ids = [rec._db_id for rec in family_records]
fam_record_idx = family_member_db_ids.index(family_db_id)
fam_record = family_records[fam_record_idx]

family_member_db_ids = [
rec._db_id
for idx, rec in enumerate(records_list)
if idx in family_member_records_list_ids
]

exemplar_record = records_list[exemplar_records_list_id]

ref_genes_ = set()
aln = []
for bgc, bgc_db_id in zip(family_members, family_member_db_ids):
bgc_record = records[bgc]
for bgc, bgc_db_id in zip(family_member_records_list_ids, family_member_db_ids):
bgc_record = records_list[bgc]
bgc_gbk = bgc_record.parent_gbk
if bgc_gbk is None:
raise AttributeError("Record parent GBK is not set!")
Expand All @@ -911,28 +913,31 @@ def generate_bs_family_alignment(

bgc_domains = bgc_record.get_hsps()

if bgc_db_id == family_db_id:
if bgc_db_id == exemplar_record._db_id:
# skip exemplar-exemplar comparison
continue
elif bgc == exemplar_records_list_id:
aln.append([[dom_num, 0] for dom_num in range(len(bgc_domains))])
continue

# records from the same region will not already have an lcs computed,
# should they end up in the same family we can try to align them
elif bgc_gbk == fam_record.parent_gbk:
elif bgc_gbk == exemplar_record.parent_gbk:
a_start, b_start, reverse = align_subrecords(
fam_record.get_hsps(), bgc_domains
exemplar_record.get_hsps(), bgc_domains
)

else:
lcs_data = fetch_lcs_from_db(
family_db_id,
exemplar_record._db_id,
bgc_db_id,
edge_param_id,
)

a_start, b_start, reverse = adjust_lcs_to_family_reference(
lcs_data,
family_db_id,
len(fam_record.get_hsps()),
exemplar_record._db_id,
len(exemplar_record.get_hsps()),
len(bgc_domains),
)

Expand All @@ -954,9 +959,13 @@ def generate_bs_family_alignment(
logging.debug("Generating newick tree for %s", family_name)
fam_alignment = {
"id": family_name,
"ref": family_members[fam_record_idx],
"ref": exemplar_records_list_id,
"newick": generate_newick_tree(
family_records, fam_record_idx, family_members, family_name, tree_path
records_list,
exemplar_records_list_id,
family_member_records_list_ids,
family_name,
tree_path,
),
"ref_genes": ref_genes,
"aln": aln,
Expand Down Expand Up @@ -1035,24 +1044,32 @@ def generate_bs_families_members(
"""Generate a dictionary where keys are family indexes and values are list of region
indexes that belong to that family
indexes correspond to the index of the record in a full list of records (in a bin)
this is very confusing, so this desperately needs to be refactored
Args:
cutoff (float): cutoff value
pair_generator (BGCBin): BGC pair_generator
Returns:
dict[int, list[int]]: family to member regions index
"""
# get a dictionary of node id to family id
node_family = {}

families_members: dict[int, list[int]] = {}

# first make a dictionary of db id to record_list id
db_id_to_record_idx = {}
for idx, record in enumerate(pair_generator.source_records):
db_id_to_record_idx[record._db_id] = idx

if not DB.metadata:
raise RuntimeError("DB.metadata is None")

# get all families from the database
# first get all familes and their nodes
family_table = DB.metadata.tables["family"]
bgc_families_table = DB.metadata.tables["bgc_record_family"]

select_statement = (
family_select_statement = (
select(
bgc_families_table.c.record_id,
family_table.c.center_id,
Expand All @@ -1065,29 +1082,41 @@ def generate_bs_families_members(
.where(family_table.c.bin_label == pair_generator.label)
)

result = DB.execute(select_statement).fetchall()
result = DB.execute(family_select_statement).fetchall()

for row in result:
node_family[row.record_id] = row.center_id

families_members: dict[int, list[int]] = {}

for idx, record in enumerate(pair_generator.source_records):
record_id = record._db_id
center_id_list_idx = db_id_to_record_idx[row.center_id]
record_id_list_idx = db_id_to_record_idx[row.record_id]
families_members.setdefault(center_id_list_idx, []).append(record_id_list_idx)

if record_id is None:
raise AttributeError("Record id is None!")
# add singletons

if record_id not in node_family:
families_members[record_id] = [idx]
continue
record_table = DB.metadata.tables["bgc_record"]

family_id = node_family[record_id]
singleton_select_statement = (
select(record_table.c.id)
.where(record_table.c.id.in_(pair_generator.record_ids))
.where(
record_table.c.id.not_in(
select(
bgc_families_table.c.record_id,
)
.join(family_table, bgc_families_table.c.family_id == family_table.c.id)
.where(bgc_families_table.c.record_id.in_(pair_generator.record_ids))
.where(family_table.c.cutoff == cutoff)
.where(family_table.c.bin_label == pair_generator.label)
)
)
)

if family_id not in families_members:
families_members[family_id] = []
result = DB.execute(singleton_select_statement).fetchall()

families_members[family_id].append(idx)
for row in result:
if row.id in families_members:
logging.warning("Singleton record %s is already in a family", row.id)
continue
singleton_list_idx = db_id_to_record_idx[row.id]
families_members[singleton_list_idx] = [singleton_list_idx]

return families_members

Expand Down Expand Up @@ -1699,9 +1728,7 @@ def write_full_network_file(run: dict, all_bgc_records: list[BGCRecord]) -> None
write_network_file(full_network_file_path, edgelist)


def get_full_network_edgelist(
run: dict, all_bgc_records: list
) -> set[
def get_full_network_edgelist(run: dict, all_bgc_records: list) -> set[
tuple[
str,
str,
Expand Down
9 changes: 6 additions & 3 deletions big_scape/trees/newick_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def find_tree_domains(


def generate_gcf_alignment(
records: list[BGCRecord], exemplar: int, family_members: list[int]
all_records: list[BGCRecord], all_exemplar: int, all_family_members: list[int]
) -> str:
"""Generate protein domain alignment for records in GCF
Expand All @@ -148,6 +148,9 @@ def generate_gcf_alignment(
str: alignment of GCF based on protein domain
TODO: refactor
"""
records = [all_records[i] for i in all_family_members]
exemplar = all_family_members.index(all_exemplar)

record_ids = list(range(len(records)))

# collect present domains for each GCF member
Expand Down Expand Up @@ -259,8 +262,8 @@ def generate_gcf_alignment(
for bgc in delete_bgc:
del alignments[bgc]

algn_string = f">{family_members[exemplar]}\n{alignments[exemplar]}\n"
algn_string = f">{all_family_members[exemplar]}\n{alignments[exemplar]}\n"
for bgc in alignments:
if bgc != exemplar:
algn_string += f">{family_members[bgc]}\n{alignments[bgc]}\n"
algn_string += f">{all_family_members[bgc]}\n{alignments[bgc]}\n"
return algn_string

0 comments on commit 66f8820

Please sign in to comment.