From a476f06fbd67b08a628c96de7609568a24df3509 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gwendolyn=20O=2E=20D=C3=B6bel?= <81755070+GwennyGit@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:19:32 +0200 Subject: [PATCH] Adjusted gapfill.py & entities.py #52 #126 - Adjusted due to new db_access set up - Started writing code for mapping from BioCyc IDs to other databases for the BioCycGapFiller --- src/refinegems/classes/gapfill.py | 179 ++++++++---- src/refinegems/utility/entities.py | 421 +++-------------------------- 2 files changed, 176 insertions(+), 424 deletions(-) diff --git a/src/refinegems/classes/gapfill.py b/src/refinegems/classes/gapfill.py index 195edef..5c321df 100644 --- a/src/refinegems/classes/gapfill.py +++ b/src/refinegems/classes/gapfill.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # @TODO: Correct/Rewrite doc string +# @TEST - new runtimes """Add more reactions, genes and more to a model based on different gap-filling methods. The `gapfill` module can be used either with KEGG were you only need the KEGG organism code or with BioCyc or with both (Options: 'KEGG', 'BioCyc', 'KEGG+BioCyc'). For how to obtain the BioCyc tables look into the documentation under: 'Main modules' > 'Gap filling' > 'Automated gap filling'. @@ -19,13 +20,12 @@ from abc import ABC, abstractmethod -import ast import cobra import io -from matplotlib import axis import numpy as np import pandas as pd import re +import sqlite3 import warnings from bioservices.kegg import KEGG @@ -38,10 +38,11 @@ from tqdm import tqdm tqdm.pandas() -from ..curation.db_access.db import get_ec_from_ncbi, get_ec_via_swissprot +from ..utility.databases import PATH_TO_DB +from ..utility.db_access import get_ec_from_ncbi, get_ec_via_swissprot, parse_KEGG_gene, parse_KEGG_ec, add_stoichiometric_values_to_reacs_from_bigg from ..utility.io import load_a_table_from_database, parse_gff_for_cds, load_model, write_model_to_file -from ..utility.entities import create_gp, create_gpr, build_reaction_bigg, build_reaction_biocyc, build_reaction_kegg, build_reaction_mnx, isreaction_complete -from ..curation.db_access.kegg import parse_KEGG_gene, parse_KEGG_ec +from ..utility.entities import create_gp, create_gpr, build_reaction_bigg, build_reaction_kegg, build_reaction_mnx, isreaction_complete, extract_metabolites_from_reactions +from ..utility.util import VALID_COMPARTMENTS from ..developement.decorators import * ##### Coloured text: @@ -55,7 +56,7 @@ def coloured_example_text(): 'If no entry for your organism exists in KEGG and/or BioCyc, the gap analysis cannot be done.') -# Note: +# @Note: # some reactions have @DEBUGGING # enable these parts to shorten runtime during debugging (used to work on a subset, # not on the whole input) @@ -66,9 +67,121 @@ def coloured_example_text(): ############################################################################ -# where to put +# functions ############################################################################ +# @DISCUSSION: Keep for end user? Use somewhere else? Or deprecate? +# Function originally from refineGEMs.genecomp/refineGEMs.KEGG_analysis --- Modified +def compare_bigg_model(complete_df: pd.DataFrame, model_entities: pd.DataFrame, metabolites: bool=False) -> pd.DataFrame: + """Compares missing entities obtained through genes extracted via KEGG/BioCyc to entities in the model + Needed to back check previous comparisons. + + Args: + - complete_df (pd.DataFrame): + Table that contains KEGG/BioCyc Id, BiGG Id & more + - model_entities (pd.DataFrame): + BiGG Ids of entities in the model + - metabolites (bool): + True if names of metabolites should be added, otherwise false + + Returns: + pd.DataFrame: + Table containing entities present in KEGG/BioCyc but not in the model + """ + db = 'KEGG' if 'KEGG' in complete_df.columns else 'BioCyc' # Find out which database was used + + # Get only IDs that are not in model + mapp = complete_df.set_index('bigg_id') + entities = model_entities.set_index('bigg_id') + entities_missing_in_model = mapp[~mapp.index.isin( + entities.index)].reset_index() + + db_ids = entities_missing_in_model.groupby('bigg_id')[db].agg(set) # Get a set of all BioCyc/KEGG IDs belonging to one BiGG ID + + # Add set of BioCyc/KEGG IDs belonging to one BiGG ID to the dataframe + entities_missing_in_model.set_index('bigg_id', inplace=True) + entities_missing_in_model.loc[:, db] = db_ids + entities_missing_in_model.reset_index(inplace=True) + + if 'id_group' in entities_missing_in_model.columns: # Remove reaction ID duplicates but keep all related BiGG & BioCyc/KEGG IDs in a list + aliases = entities_missing_in_model.groupby(['compartment', 'id_group'])['bigg_id'].agg(set) # Get a set of the 'duplicated' BiGG reaction IDs -> aliases + entities_missing_in_model.drop_duplicates(['compartment', 'id_group'], inplace=True, ignore_index=True) # Drop duplicates where compartments & id_group same + + # Add set of BiGG ID aliases to the dataframe + entities_missing_in_model.set_index(['compartment', 'id_group'], inplace=True) + entities_missing_in_model.loc[:, 'bigg_aliases'] = aliases + entities_missing_in_model.reset_index(inplace=True) + + entities_missing_in_model.drop(labels='id_group', axis=1, inplace=True) # id_group is not longer necessary + + entities_missing_in_model.drop_duplicates(subset='bigg_id', inplace=True, ignore_index=True) # Remove BiGG ID duplicates + + # Add name column to dataframe + def get_name_from_bigg(bigg_id: str): + bigg_db = 'bigg_metabolites' if metabolites else 'bigg_reactions' + query = f"SELECT name FROM {bigg_db} WHERE bigg_id=\'{bigg_id}\'" + name_from_bigg = con.execute(query).fetchone()[0] + return name_from_bigg + + con = sqlite3.connect(PATH_TO_DB) # Open connection to database + entities_missing_in_model['name'] = entities_missing_in_model['bigg_id'].map(get_name_from_bigg) + con.close() + + # Add compartment ID to all BiGG metabolites + if metabolites: + def get_compartment_from_id(bigg_id: str): + compartment = bigg_id[-1] + return compartment if compartment in VALID_COMPARTMENTS.keys() else np.nan # To filter the incorrect compartments out + + entities_missing_in_model['compartment'] = entities_missing_in_model.apply( + lambda row: get_compartment_from_id(row['bigg_id']), axis=1) + entities_missing_in_model.dropna(subset=['compartment'], inplace=True) # Drop all BiGG metabolite IDs which have no valid compartment + + return entities_missing_in_model + + +# Mapping for BioCyc Reactions +# ---------------------------- +def map_biocyc_to_reac(biocyc_reacs: pd.DataFrame) -> tuple[pd.DataFrame, dict[str: int]]: + + statistics = { + 'mapped2MNX': 0, + 'mapped2BiGG': 0, + 'mapped2KEGG': 0, + 'remaining_unmapped': 0 + } + + # Drop NaNs in relevant columns + biocyc_reacs.dropna(subset=['id', 'ec-code'], inplace=True) + + # Mapping to BiGG + addition of more information on BiGG Reactions + # Get BiGG BioCyc + bigg2biocyc_reacs = load_a_table_from_database('SELECT id, BioCyc from bigg_reactions') + + # Subset missing_reactions with BiGG BioCyc + bigg2biocyc_reacs.rename({'id': 'BiGG', 'BioCyc': 'id'}, inplace=True) + # @TODO: collect non-matched entries // return all // namespace independance ???? + biocyc_reacs = bigg2biocyc_reacs.merge(biocyc_reacs, on='BioCyc', how='right') + + # Get amount of missing reactions that have a BiGG ID + statistics['Reaction']['Have BiGG ID'] = len(biocyc_reacs['BioCyc'].unique().tolist()) + + # Get amount of missing reactions that are not in the model + statistics['Reaction']['Can be added'] = len(missing_reactions['bigg_id'].unique().tolist()) + + # Add reactants & products dictionary with stoichiometric values to the reactions table + missing_reactions = add_stoichiometric_values_to_reacs_from_bigg(missing_reactions) + + # Get all metabolites for the missing reactions + biocyc_metabs_from_reacs, bigg_metabs_from_reacs = extract_metabolites_from_reactions(missing_reactions) + + # @NOTE + # @TODO + missing_bc_reacs = 'mapped_reacs + unmapped_reacs' + + return missing_bc_reacs, statistics + + # Mapping of EC numbers # --------------------- @@ -176,7 +289,7 @@ def _map_ec_to_reac_kegg(unmapped_reacs:pd.DataFrame) -> pd.DataFrame: """ # get KEGG EC number information - kegg_mapped = pd.DataFrame.from_dict(list(unmapped_reacs.progress_apply(parse_KEGG_ec, desc='Mapping reacs to KEGG'))) + kegg_mapped = pd.DataFrame.from_dict(list(unmapped_reacs.progress_apply(parse_KEGG_ec))) kegg_mapped['is_transport'] = None kegg_mapped['via'] = kegg_mapped['id'].apply(lambda x: 'KEGG' if x else None) kegg_mapped.explode(column='id') @@ -637,9 +750,9 @@ def fill_model(self, model:Union[cobra.Model,libModel], # ....................... # @DEBUGGING - if len(self.missing_reacs) > 10: - self.missing_reacs = self.missing_reacs.sample(10) - print('fill_model: Running in debugging mode') + # if len(self.missing_reacs) > 10: + # self.missing_reacs = self.missing_reacs.sample(10) + # print('fill_model: Running in debugging mode') # ....................... # add reactions to model @@ -692,6 +805,10 @@ class KEGGapFiller(GapFiller): """Based on a KEGG organism ID (corresponding to the organism of the model), find missing genes in the model and map them to reactions to try and fill the gaps found with the KEGG database. + + .. note:: + + Due to the KEGG REST API this is relatively slow. Attributes: @@ -767,8 +884,8 @@ def get_model_genes(model: libModel) -> pd.DataFrame: # Step 5: map to EC via KEGG # -------------------------- # @DEBUGGING ................... - genes_not_in_model = genes_not_in_model.iloc[330:350,:] - print(UserWarning('Running in debugging mode.')) + # genes_not_in_model = genes_not_in_model.iloc[330:350,:] + # print(UserWarning('Running in debugging mode.')) # .............................. geneKEGG_mapping = pd.DataFrame.from_dict(list(genes_not_in_model['orgid:locus'].progress_apply(parse_KEGG_gene))) genes_not_in_model = genes_not_in_model.merge(geneKEGG_mapping, how='left', on='orgid:locus') @@ -1232,8 +1349,8 @@ def find_missing_reacs(self, model:cobra.Model, # ----------------------------------- # -> access ncbi for ec (optional) # @DEBUGGING ................... - mapped_reacs = mapped_reacs.iloc[300:350,:] - print(UserWarning('Running in debugging mode.')) + # mapped_reacs = mapped_reacs.iloc[300:350,:] + # print(UserWarning('Running in debugging mode.')) # .............................. if check_NCBI and mail: mapped_reacs['ec-code'] = mapped_reacs.progress_apply(lambda x: get_ec_from_ncbi(mail,x['ncbiprotein']) if not x['ec-code'] and not x['ncbiprotein'].isna() else x['ec-code'], axis=1) @@ -1294,37 +1411,7 @@ def find_missing_reacs(self, model:cobra.Model, ############################################################################ # For filtering ############################################################################ -# Evtl hier: -''' -biocyc_reacs.dropna( - subset=['id', 'Reactants', 'Products', 'EC'], inplace=True - ) -''' - -''' Mapping to BiGG + addition of more information on BiGG Reactions -# Get BiGG BioCyc -bigg2biocyc_reacs = get_bigg_db_mapping('BioCyc',False) - -# Subset missing_reactions with BiGG BioCyc -missing_reactions.rename(columns={'Reaction': 'BioCyc'}, inplace=True) -# @TODO: collect non-matched entries // return all // namespace independance ???? -missing_reactions = bigg2biocyc_reacs.merge(missing_reactions, on='BioCyc') - -# Get amount of missing reactions that have a BiGG ID -self._statistics['Reaction']['Have BiGG ID'] = len(missing_reactions['BioCyc'].unique().tolist()) - -# Subset missing_reactions with model_reacs -missing_reactions = compare_bigg_model(missing_reactions, model_reacs) - -# Get amount of missing reactions that are not in the model -self._statistics['Reaction']['Can be added'] = len(missing_reactions['bigg_id'].unique().tolist()) - -# Add reactants & products dictionary with stoichiometric values to the reactions table -missing_reactions = add_stoichiometric_values_to_reacs(missing_reactions) - -# Get all metabolites for the missing reactions -biocyc_metabs_from_reacs, bigg_metabs_from_reacs = extract_metabolites_from_reactions(missing_reactions) -''' +# Evtl hier: # Inspired by Dr. Reihaneh Mostolizadeh's function to add BioCyc reactions to a model def replace_reaction_direction_with_fluxes(missing_reacs: pd.DataFrame) -> pd.DataFrame: diff --git a/src/refinegems/utility/entities.py b/src/refinegems/utility/entities.py index df7e036..197ee80 100644 --- a/src/refinegems/utility/entities.py +++ b/src/refinegems/utility/entities.py @@ -9,10 +9,9 @@ # requirements ################################################################################ +import ast import cobra import json -import logging -import memote.support.helpers as helpers import pandas as pd import re import requests @@ -23,26 +22,25 @@ from Bio.KEGG import REST, Compound from libsbml import Model as libModel from libsbml import GeneProduct, Species, Reaction, FbcOr, FbcAnd, GeneProductRef -from memote.utils import truncate -from random import choice -from six import iteritems + +from random import choice from string import ascii_uppercase, digits from typing import Union, Literal +from tqdm import tqdm + +tqdm.pandas() +pd.options.mode.chained_assignment = None # suppresses the pandas SettingWithCopyWarning; comment out before developing!! -from .cvterms import add_cv_term_genes, add_cv_term_metabolites, add_cv_term_reactions -from .io import search_ncbi_for_gpr, load_a_table_from_database, kegg_reaction_parser +from .cvterms import add_cv_term_genes, add_cv_term_metabolites, add_cv_term_reactions, _add_annotations_from_dict_cobra +from .db_access import search_ncbi_for_gpr, kegg_reaction_parser, _add_annotations_from_bigg_reac_row, get_BiGG_metabs_annot_via_dbid, add_annotations_from_BiGG_metabs +from .io import load_a_table_from_database +from .util import COMP_MAPPING, VALID_COMPARTMENTS from ..developement.decorators import * ################################################################################ # variables ################################################################################ -# compartments -# ------------ -VALID_COMPARTMENTS = {'c': 'cytosol', 'e': 'extracellular space', 'p':'periplasm','y':'unknown compartment'} #: :meta: -COMP_MAPPING = {'c': 'c', 'e': 'e', 'p': 'p', - 'C_c': 'c', 'C_e': 'e', 'C_p': 'p', - '':'y'} #: :meta: ################################################################################ # functions @@ -103,144 +101,6 @@ def resolve_compartment_names(model:cobra.Model): raise KeyError(F'Unknown compartment {[_ for _ in model.compartments if _ not in COMP_MAPPING.keys()]} detected. Cannot resolve problem.') -# handling biomass reaction -# ------------------------- -def test_biomass_presence(model: cobra.Model) -> Union[list[str], None]: - """ - Modified from MEMOTE: https://github.com/opencobra/memote/blob/81a55a163262a0e06bfcb036d98e8e551edc3873/src/memote/suite/tests/test_biomass.py#LL42C3-L42C3 - - Expect the model to contain at least one biomass reaction. - - The biomass composition aka biomass formulation aka biomass reaction - is a common pseudo-reaction accounting for biomass synthesis in - constraints-based modelling. It describes the stoichiometry of - intracellular compounds that are required for cell growth. While this - reaction may not be relevant to modeling the metabolism of higher - organisms, it is essential for single-cell modeling. - - Implementation: - Identifies possible biomass reactions using two principal steps: - - 1. Return reactions that include the SBO annotation "SBO:0000629" for - biomass. - - 2. If no reactions can be identified this way: - - 1. Look for the ``buzzwords`` "biomass", "growth" and "bof" in reaction IDs. - 2. Look for metabolite IDs or names that contain the ``buzzword`` "biomass" and obtain the set of reactions they are involved in. - 3. Remove boundary reactions from this set. - 4. Return the union of reactions that match the buzzwords and of the reactions that metabolites are involved in that match the buzzword. - - This test checks if at least one biomass reaction is present. - - If no reaction can be identified return None. - - """ - biomass_rxn = [rxn.id for rxn in helpers.find_biomass_reaction(model)] - outcome = len(biomass_rxn) > 0 - logging.info( - """In this model the following {} biomass reaction(s) were - identified: {}""".format( - len(biomass_rxn), truncate(biomass_rxn) - ) - ) - if outcome: return biomass_rxn - else: return None - - -def sum_biomass_weight(reaction: Reaction) -> float: - """ - From MEMOTE: https://github.com/opencobra/memote/blob/81a55a163262a0e06bfcb036d98e8e551edc3873/src/memote/support/biomass.py#L95 - - Compute the sum of all reaction compounds. - - This function expects all metabolites of the biomass reaction to have - formula information assigned. - - Args: - - reaction (Reaction): - The biomass reaction of the model under investigation. - - Returns: - float: - The molecular weight of the biomass reaction in units of g/mmol. - """ - return ( - sum( - -coef * met.formula_weight - for (met, coef) in iteritems(reaction.metabolites) - ) - / 1000.0 - ) - - -def test_biomass_consistency(model: cobra.Model, reaction_id: str) -> Union[float, str]: - """ - Modified from MEMOTE: https://github.com/opencobra/memote/blob/81a55a163262a0e06bfcb036d98e8e551edc3873/src/memote/suite/tests/test_biomass.py#L89 - - Expect biomass components to sum up to 1 g[CDW]. - - This test only yields sensible results if all biomass precursor - metabolites have chemical formulas assigned to them. - The molecular weight of the biomass reaction in metabolic models is - defined to be equal to 1 g/mmol. Conforming to this is essential in order - to be able to reliably calculate growth yields, to cross-compare models, - and to obtain valid predictions when simulating microbial consortia. A - deviation from 1 - 1E-03 to 1 + 1E-06 is accepted. - - Implementation: - Multiplies the coefficient of each metabolite of the biomass reaction with - its molecular weight calculated from the formula, then divides the overall - sum of all the products by 1000. - - Args: - - model(cobraModel): - The model loaded with COBRApy. - - reaction_id(str): - Reaction ID of a BOF. - - Returns: - (1) Case: problematic input - str: - an error message. - - (2) Case: successful testing - float: - biomass weight - """ - reaction = model.reactions.get_by_id(reaction_id) - try: - biomass_weight = sum_biomass_weight(reaction) - except TypeError: - message = """ - One or more of the biomass components do not have a defined formula or contain unspecified chemical groups. - The biomass overall weight could thus not be calculated. - """ - return message - else: - if ((1 - 1e-03) < biomass_weight < (1 + 1e-06)): - logging.info( - """The component molar mass of the biomass reaction {} sums up to {} - which is inside the 1e-03 margin from 1 mmol / g[CDW] / h. - """.format( - reaction_id, biomass_weight - ) - ) - else: - logging.warning( - """The component molar mass of the biomass reaction {} sums up to {} - which is outside of the 1e-03 margin from 1 mmol / g[CDW] / h. - """.format( - reaction_id, biomass_weight - ) - ) - #outcome = (1 - 1e-03) < biomass_weight < (1 + 1e-06) -> Need to implement that for check - # To account for numerical inaccuracies, a range from 1-1e0-3 to 1+1e-06 - # is implemented in the assertion check - return biomass_weight - - - # handling cobra entities (features) # ---------------------------------- @@ -369,6 +229,36 @@ def create_random_id(model:cobra.Model, entity_type:Literal['reac','meta']='reac j = j + 1 +# @NOTE: Add to reaction handling for `build_reac_bigg`? +def extract_metabolites_from_reactions(missing_reactions: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: + """Extracts a set of all reactants & products from the missing reactions + + Args: + - missing_reactions (pd.DataFrame): + Table containing all missing reactions found through the + missing genes + + Returns: + tuple: + Two tables (1) & (2) + + (1) pd.DataFrame: Table with the column Compound containing all compounds required for the missing BioCyc reactions + (2) pd.DataFrame: Table with the column bigg_id containing all compounds required for the missing BiGG reactions + """ + # Get all BioCyc metabolites necessary for the BioCyc reactions + biocyc_reactants = [r for row in missing_reactions['Reactants'] for r in row] + biocyc_products = [p for row in missing_reactions['Products'] for p in row] + biocyc_metabolites = list(set([*biocyc_reactants, *biocyc_products])) + + # Get all BiGG metabolites necessary for the BiGG reactions + bigg_reactions = [ast.literal_eval(str(reac_dict)) for reac_dict in missing_reactions['bigg_reaction']] + bigg_reactants = [r for reac_dict in bigg_reactions for r in reac_dict.get('reactants')] + bigg_products = [p for reac_dict in bigg_reactions for p in reac_dict.get('products')] + bigg_metabolites = list(set([*bigg_reactants, *bigg_products])) + + return (pd.DataFrame(biocyc_metabolites, columns=['Compound']), pd.DataFrame(bigg_metabolites, columns=['bigg_id'])) + + # @TODO: # more namespace options # Maybe bioregistry for mapping of id to namespace? @@ -508,109 +398,6 @@ def isreaction_complete(reac:cobra.Reaction, return True -# extending annotations -# --------------------- -# @TODO is this a good place for these functions or maybe somewhere else? - -# @TODO name of this function is not good -def get_BiGG_metabs_annotation_via_dbid(metabolite:cobra.Metabolite, - id:str, dbcol:str, - compartment:str = 'c') -> None: - """Search for a BiGG ID and add it to a metabolite annotation. - The search is based on a column name of the BiGG metabolite table - and an ID to search for. Additionally, using the given compartment name, - the found IDs are filtered for matching compartments. - - Args: - - metabolite (cobra.Metabolite): - The metabolite. Needs to a a COBRApy Metabolte object. - - id (str): - The ID to search for in the database. - - dbcol (str): - Name of the column of the database to check the ID against. - - compartment (str, optional): - The compartment name. Needs to be a valid BiGG compartment ID. - Defaults to 'c'. - """ - # check, if a BiGG annotation is given - if not 'bigg.metabolite' in metabolite.annotation.keys(): - # seach the database for the found id - bigg_search = load_a_table_from_database( - f'SELECT * FROM bigg_metabolites WHERE \'{dbcol}\' = \'{id}\'', - query=True) - if len(bigg_search) > 0: - # check, if the matches also match the compartment - metabolite.annotation['bigg.metabolite'] = [_ for _ in bigg_search['id'].tolist() if _.endswith(f'_{compartment}')] - # add final matches to the annotations of the metabolite - if len(metabolite.annotation['bigg.metabolite']) == 0: - metabolite.annotation.pop('bigg.metabolite') - - -def add_annotations_from_BiGG_metabs(metabolite:cobra.Metabolite) -> None: - """Check a cobra.metabolite for bigg.metabolite annotations. If they exists, - search for more annotations in the BiGG database and add them to the metabolite. - - Args: - - metabolite (cobra.Metabolite): - The metabolite object. - """ - if 'bigg.metabolite' in metabolite.annotation.keys(): - bigg_information = load_a_table_from_database( - 'SELECT * FROM bigg_metabolites WHERE id = \'' + f'\' OR id = \''.join(metabolite.annotation['bigg.metabolite']) + '\'', - query=True) - db_id_bigg = {'BioCyc':'biocyc', 'MetaNetX (MNX) Chemical':'metanetx.chemical','SEED Compound':'seed.compound','CHEBI':'chebi', 'KEGG Compound':'kegg.compound'} - for db in db_id_bigg: - info = list(set(bigg_information[db].dropna().to_list())) - if len(info) > 0: - info = ','.join(info) - info = [x.strip() for x in info.split(',')] # make sure all entries are a separate list object - if db_id_bigg[db] in metabolite.annotation.keys(): - metabolite.annotation[db_id_bigg[db]] = list(set(info + metabolite.annotation[db_id_bigg[db]])) - else: - metabolite.annotation[db_id_bigg[db]] = info - - -def _add_annotations_from_bigg_reac_row(row:pd.Series, reac:cobra.Reaction) -> None: - """Given a row of the BiGG reaction database table and a cobra.Reaction object, - extend the annotation of the latter with the information of the former. - - Args: - - row (pd.Series): - The row of the database table. - - reac (cobra.Reaction): - The reaction object. - """ - - dbnames = {'RHEA':'rhea','BioCyc':'biocyc','MetaNetX (MNX) Equation':'metanetx.reaction','EC Number':'ec-code'} - for dbname,dbprefix in dbnames.items(): - if row[dbname]: - ids_to_add = row[dbname].split(',') - if dbprefix in reac.annotation.keys(): - reac.annotation[dbprefix] = list(set(reac.annotation[dbprefix]).union(set(ids_to_add))) - else: - reac.annotation[dbprefix] = ids_to_add - - -def _add_annotations_from_dict_cobra(references:dict, entity:cobra.Reaction|cobra.Metabolite|cobra.Model) -> None: - """Given a dictionary and a cobra object, add the former as annotations to the latter. - The keys of the dictionary are used as the annotation labels, the values as the values. - If the keys are already in the entity, the values will be combined (union). - - Args: - - references (dict): - The dictionary with the references to add the entity. - - entity (cobra.Reaction | cobra.Metabolite | cobra.Model): - The entity to add annotations to. - """ - # add additional references from the parameter - for db,idlist in references.items(): - if not isinstance(idlist,list): - idlist = [idlist] - if db in entity.annotation.keys(): - entity.annotation[db] = list(set(entity.annotation[db] + idlist)) - else: - entity.annotation[db] = idlist - # adding metabolites to cobra models # ---------------------------------- @@ -754,7 +541,7 @@ def build_metabolite_mnx(id: str, model:cobra.Model, new_metabolite.annotation['bigg.metabolite'] = [_+'_'+compartment for _ in new_metabolite.annotation['bigg.metabolite']] else: # if no BiGG was found in MetaNetX, try reverse search in BiGG - get_BiGG_metabs_annotation_via_dbid(new_metabolite, id, 'MetaNetX (MNX) Chemical', compartment) + get_BiGG_metabs_annot_via_dbid(new_metabolite, id, 'MetaNetX (MNX) Chemical', compartment) # add additional information from BiGG (if ID found) add_annotations_from_BiGG_metabs(new_metabolite) @@ -924,7 +711,7 @@ def build_metabolite_kegg(kegg_id:str, model:cobra.Model, new_metabolite.annotation['bigg.metabolite'] = [_+'_'+compartment for _ in new_metabolite.annotation['bigg.metabolite']] # if no BiGG ID, try reverse search - get_BiGG_metabs_annotation_via_dbid(new_metabolite, id, 'KEGG Compound', compartment) + get_BiGG_metabs_annot_via_dbid(new_metabolite, id, 'KEGG Compound', compartment) # search for annotations in BiGG add_annotations_from_BiGG_metabs(new_metabolite) @@ -1080,20 +867,11 @@ def build_metabolite_bigg(id:str, model:cobra.Model, return model.metabolites.get_by_id(new_metabolite.id) return new_metabolite - - -@implement -def build_metabolite_biocyc(id:str, model:cobra.Model, - namespace:str, - compartment:str, - idprefix:str) -> cobra.Metabolite: - pass # adding reactions cobra models # ----------------------------- -# @TODO: BioCyc missing def parse_reac_str(equation:str, type:Literal['BiGG','BioCyc','MetaNetX','KEGG']='MetaNetX') -> tuple[dict,dict,list,bool]: """Parse a reaction string. @@ -1667,119 +1445,6 @@ def build_reaction_bigg(model:cobra.Model, id:str, return [new_reac.id] return new_reac - - -@implement -def build_reaction_biocyc(model:cobra.Model, id:str=None, - reac_str:str=None, - references:dict={}, - idprefix:str='refineGEMs', - namespace:Literal['BiGG']='BiGG') -> Union[cobra.Reaction, list, None]: - """Construct a new reaction for a model from a BiGG reaction ID. - This function will NOT add the reaction directly to the model, if the - construction process is successful. - - Args: - - model (cobra.Model): - The model loaded with COBRApy. - - id (str): - A BioCyc reaction ID. - - reac_str (str, optional): - The reaction equation string from the database. - Defaults to None. - - references (dict, optional): - Additional annotations to add to the reaction (idtype:[value]). - Defaults to {}. - - idprefix (str, optional): - Prefix for the pseudo-identifier. Defaults to 'refineGEMs'. - - namespace (Literal['BiGG'], optional): - Namespace to use for the reaction ID. - If namespace cannot be matched, uses the pseudo-ID - Defaults to 'BiGG'. - - Returns: - Case successful construction: - - cobra.Reaction: - The newly build reaction object. - - Case construction not possible: - - None: - Nothing to return. - - Case reaction found in model. - - list: - List of matching reaction IDs (in model). - """ - - # --------------------- - # check, if ID in model - # --------------------- - matches_found = [_.id for _ in model.reactions if 'bigg.reaction' in _.annotation.keys() and _.annotation['bigg.reaction']==id] - if len(matches_found) > 0: - return matches_found - - # ----------------------------- - # otherwise, build new reaction - # ----------------------------- - # create reaction object - new_reac = cobra.Reaction(create_random_id(model,'reac',idprefix)) - - # get information from the database - bigg_reac_info = load_a_table_from_database( - f'SELECT * FROM bigg_reactions WHERE id = \'{id}\'', - query=True).iloc[0,:] - new_reac.name = bigg_reac_info['name'] - - # add metabolites - # --------------- - reactants,products,comparts,rev = parse_reac_str(bigg_reac_info['reaction_string'],'BiGG') - - metabolites = {} - meta_counter = 0 - # reconstruct reactants - for mid,factor in reactants.items(): - tmp_meta = build_metabolite_bigg(mid,model, - namespace, - idprefix) - if tmp_meta: - metabolites[tmp_meta] = -1*factor - meta_counter += 1 - else: - return None # not able to build reaction successfully - - # reconstruct products - for mid,factor in products.items(): - tmp_meta = build_metabolite_bigg(mid,model, - namespace, - idprefix) - if tmp_meta: - metabolites[tmp_meta] = factor - meta_counter += 1 - else: - return None # not able to build reaction successfully - - # add metabolites to reaction - # @TODO: does it need some kind of try and error, if - for some highly unlikely reason - two newly generated ids are the same - new_reac.add_metabolites(metabolites) - - # set reversibility - if rev: - new_reac.bounds = (1000.0,1000.0) - else: - new_reac.bounds = (0.0,1000.0) - - # add annotations - # --------------- - # add SBOTerm - new_reac.annotation['sbo'] = 'SBO:0000167' - # add infos from BiGG - new_reac.annotation['bigg.reaction'] = [id] - _add_annotations_from_bigg_reac_row(bigg_reac_info, new_reac) - # add additional references from the parameter - _add @implement