Update #126

draeger-lab · Aug 21, 2024 · 21ada1a · 21ada1a
1 parent cd4e354
commit 21ada1a
Show file tree

Hide file tree

Showing 6 changed files with 29 additions and 27 deletions.
diff --git a/dev/gapfill-testing.ipynb b/dev/gapfill-testing.ipynb
@@ -1258,7 +1258,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,

diff --git a/pyproject.toml b/pyproject.toml
@@ -45,11 +45,11 @@ dependencies = [
     "gffutils>=0.10.1",
     "libchebipy>=1.0.10",
     "markupsafe>=2.0.1",
-    "masschargecuration@git+https://github.com/Biomathsys/MassChargeCuration@installation-fix", # @TODO wait for pull request
+    "masschargecuration@git+https://github.com/Biomathsys/MassChargeCuration@installation-fix", # @NOTE wait for pull request
     'matplotlib >= 3.8.2',
     "memote>=0.17.0", 
     "multiprocess>=0.70.16",
-    "numpy>=1.20.3,<2.0.0",  # @TODO wait for update
+    "numpy>=1.20.3,<2.0.0",  # @TODO wait for update? >2.0.0 not working properly
     "ols-client>=0.1.3",
     "pandas>=1.2.4",
     'pyyaml>=6.0.1',

diff --git a/src/refinegems/classes/gapfill.py b/src/refinegems/classes/gapfill.py
@@ -289,7 +289,10 @@ def _map_ec_to_reac_kegg(unmapped_reacs:pd.DataFrame) -> pd.DataFrame:
         """
 
         # get KEGG EC number information
-        kegg_mapped = pd.DataFrame.from_dict(list(unmapped_reacs.progress_apply(parse_KEGG_ec)))   
+        eccodes = unmapped_reacs['ec-code']
+        kegg_mapped = pd.DataFrame.from_dict(list(eccodes.progress_apply(parse_KEGG_ec)))
+        kegg_mapped = unmapped_reacs.merge(kegg_mapped, on='ec-code')
+
         kegg_mapped['is_transport'] = None
         kegg_mapped['via'] = kegg_mapped['id'].apply(lambda x: 'KEGG' if x else None)
         kegg_mapped.explode(column='id')
@@ -316,6 +319,7 @@ def _map_ec_to_reac_kegg(unmapped_reacs:pd.DataFrame) -> pd.DataFrame:
                 table = pd.concat([to_map,table[~table['id'].isna()]])
         else:
             table = _map_ec_to_reac_bigg(table)
+
 
     # map to KEGG
     if use_KEGG: 
@@ -368,7 +372,7 @@ def __init__(self) -> None:
         self.missing_reactions = None   # missing reacs, that have not yet been sorted into any category
 
         # general information
-        self.geneid_type = 'ncbi' # @TODO
+        self.geneid_type = 'ncbi' # @TODO more options?
 
         # collect stats & Co, can be extended by subclasses
         self._statistics = {  
@@ -680,6 +684,7 @@ def add_reactions_from_table(self, model:cobra.Model,
 
     # @TODO : logging
     # @TODO : save stuff for report / manual curation
+    # @DISCUSSION: Check for futile cycles after each addition of a new reaction?
     # @TEST - only tested in debugging mode with GeneGapFiller
     def fill_model(self, model:Union[cobra.Model,libModel], 
                    **kwargs) -> libModel:
@@ -749,7 +754,7 @@ def fill_model(self, model:Union[cobra.Model,libModel],
                 cobramodel = load_model(tmp.name,'cobra')
 
             # .......................
-            # @DEBUGGING
+            # @DEBUG
             # if len(self.missing_reacs) > 10:
             #     self.missing_reacs = self.missing_reacs.sample(10)
             #     print('fill_model: Running in debugging mode')
@@ -883,7 +888,7 @@ def get_model_genes(model: libModel) -> pd.DataFrame:
 
         # Step 5: map to EC via KEGG
         # --------------------------
-        # @DEBUGGING ...................
+        # @DEBUG .......................
         # genes_not_in_model = genes_not_in_model.iloc[330:350,:]
         # print(UserWarning('Running in debugging mode.'))
         # ..............................
@@ -936,7 +941,9 @@ def find_missing_reacs(self,model:cobra.Model):
 # ----------------------
 # Gapfilling with BioCyc
 # ----------------------
-
+# @TODO: Possibility to add KEGG Reaction & MetaNetX IDs to SmartTable 
+#       -> So far these are empty for my test cases
+#       -> Could be added in a future update
 class BioCycGapFiller(GapFiller):
     """
     | @TODO: Write correct doc string
@@ -1348,7 +1355,7 @@ def find_missing_reacs(self, model:cobra.Model,
         # Case 2: still no EC but ncbiprotein
         # -----------------------------------
         #       -> access ncbi for ec (optional) 
-        # @DEBUGGING ...................
+        # @DEBUG .......................
         # mapped_reacs = mapped_reacs.iloc[300:350,:]
         # print(UserWarning('Running in debugging mode.'))
         # ..............................
@@ -1412,7 +1419,6 @@ def find_missing_reacs(self, model:cobra.Model,
 # For filtering
 ############################################################################
 # Evtl hier:
-
 # Inspired by Dr. Reihaneh Mostolizadeh's function to add BioCyc reactions to a model
 def replace_reaction_direction_with_fluxes(missing_reacs: pd.DataFrame) -> pd.DataFrame:
    """Extracts the flux lower & upper bounds for each reaction through the entries in column 'Reaction-Direction'

diff --git a/src/refinegems/curation/curate.py b/src/refinegems/curation/curate.py
@@ -323,7 +323,7 @@ def resolve_duplicate_reactions(model:cobra.Model, based_on:str='reaction', remo
                                     keep_reac.gene_reaction_rule = gpr_to_add # add the one existing the other 
                             else:
                                 pass # nothing to add
-                            model.reactions.get_by_id(r_id).delete()
+                            model.reactions.get_by_id(r_id).remove_from_model() # @NOTE throws warning, but seems to be a cobra-internal issue
                             print(F'\tDuplicate reaction {r_id} found. Combined to {keep_reac.id} and deleted.')
                     else:
                         print(f'\tDuplicate reactions {", ".join(mnx[1]["id"].tolist())} found.')

diff --git a/src/refinegems/curation/polish.py b/src/refinegems/curation/polish.py
@@ -1469,11 +1469,15 @@ def check_direction(model:cobra.Model,data:Union[pd.DataFrame,str]) -> cobra.Mod
         if not pd.isnull(direction):
             if 'REVERSIBLE' in direction:
                 # set reaction as reversible by setting default values for upper and lower bounds
-                r.lower_bound = -1000.
+                r.lower_bound = cobra.Configuration().lower_bound
             elif 'RIGHT-TO-LEFT' in direction:
                 # invert the default values for the boundaries
-                r.lower_bound = -1000.
-                r.upper_bound = 0.
+                r.lower_bound = cobra.Configuration().lower_bound
+                r.upper_bound = 0.0
+            elif 'LEFT-To-RIGHT' in direction:
+                # In case direction was already wrong
+                r.lower_bound = 0.0
+                r.upper_bound = cobra.Configuration().upper_bound
             else:
                 # left to right case is the standart for adding reactions
                 # = nothing left to do

diff --git a/src/refinegems/utility/entities.py b/src/refinegems/utility/entities.py
@@ -1111,9 +1111,9 @@ def build_reaction_mnx(model:cobra.Model, id:str,
 
     # set reversibility
     if rev:
-        new_reac.bounds = (1000.0,1000.0)
+        new_reac.bounds = cobra.Configuration().bounds
     else:
-        new_reac.bounds = (0.0,1000.0)
+        new_reac.bounds = (0.0,cobra.Configuration().upper_bound)
 
     # get annotations
     # ---------------
@@ -1125,7 +1125,7 @@ def build_reaction_mnx(model:cobra.Model, id:str,
             new_reac.annotation[db] = [m.split(':',1)[1] for m in db_matches['source'].tolist()]
             # update reactions direction, if MetaCyc has better information
             if db == 'metacyc.reaction' and len(db_matches[db_matches['source'].str.contains('-->')]):
-                new_reac.bounds = (0.0,1000.0)
+                new_reac.bounds = (0.0,cobra.Configuration().upper_bound)
     # add additional references from the parameter
     _add_annotations_from_dict_cobra(references,new_reac)
 
@@ -1223,11 +1223,12 @@ def build_reaction_kegg(model:cobra.Model, id:str=None, reac_str:str=None,
     # Reaction reconstruction from equation
     # -------------------------------------
 
-    # skip, if not reaction string is available
+    # skip, if no reaction string is available
     if not reac_str:
         return None # reconstruction not possible
 
     # parse reaction string
+    # @TODO filter out not useable reaction strings
     reactants,products,comparts,rev = parse_reac_str(reac_str,'KEGG')
 
     # ..............................................
@@ -1356,15 +1357,6 @@ def build_reaction_bigg(model:cobra.Model, id:str,
             list: 
                 List of matching reaction IDs (in model).
     """
-    # @TODO: 
-    # 1. If no Reaction-Direction is provided set direction to reversible!
-    # 2. No compartment provided -> Set compartment to cytosol
-    # 3. Set charge to zero?
-    # 4. Get information on formula with requests? biocyc api seems not helpful!
-    # 5. In Reihaneh's/Finn Mier's code in py4gems all reactants have stoichiometry -1 and all products 1...
-    #   -> Found example that stoichiometry in equation! Split at spaces to get?
-    # @DISCUSSION: Still no clue on how to map the names in the equations to actual biocyc compound IDs...
-    # @DISCUSSION: Check for futile cycles after each addition of a new reaction?
 
     # ---------------------
     # check, if ID in model