Merge pull request #61 from draeger-lab/gapfill-update

All checks completed, will be merged into dev.
draeger-lab · Feb 10, 2023 · e23e890 · e23e890
2 parents b01972b + f296b6f
commit e23e890
Show file tree

Hide file tree

Showing 12 changed files with 16,745 additions and 261 deletions.
diff --git a/config.yaml b/config.yaml
@@ -69,12 +69,28 @@ memote: FALSE
 output: xlsx #cl, xlsx, csv 
 
 # these work only with single models
-### Gene comparison ###
-genecomp: FALSE # set to False if not needed, only works when gprs are annotated with kegg
-# the following is only relevant when turned on
-organismid: 'T05059' # C. striatum
-gff_file: 'data/cstr.gff' # C. striatum
-biggreactions: 'data/bigg_models_reactions.txt'
+### Gapfill ###
+# All parameters are required for all db_to_compare choices except:
+# - organismid which is only required for db_to_compare: 'KEGG'/'KEGG+BioCyc'
+# - and biocyc_files which is not required for 'KEGG'
+gapfill_analysis: TRUE
+gapfill_analysis_params:
+  db_to_compare: 'KEGG'  # One of the choices KEGG|BioCyc|GFF|KEGG+BioCyc
+  organismid: 'T05059'  # Needs to be specified for KEGG and KEGG+BioCyc
+  bigg_dbs:
+    - 'data/bigg_models_reactions.txt'  # Path to BiGG reactions database
+    - 'data/bigg_models_metabolites.txt'  # Path to BiGG metabolites database
+  gff_file: 'data/cstr.gff'  # Path to RefSeq GFF file 
+  biocyc_files: 
+    - 'Path0'  # Path to TXT file containing a SmartTable from BioCyc with the columns 'Accession-2' 'Reaction of gene' (-)
+    - 'Path1'  # Path to TXT file containing a SmartTable with all reaction relevant information (*)
+    - 'Path2'  # Path to TXT file containing a SmartTable with all metabolite relevant information (+)
+    - 'Path3'  # Path to protein FASTA file used as input for CarveMe (Needed to get the protein IDs from the locus tags)
+# (-) If the organism is not in BioCyc, retrieve a table mapping all reactions in BioCyc to the corresponding sequence
+# (*) 'Reaction' 'Reactants of reaction' 'Products of reaction' 'EC-Number' 'Reaction-Direction' 'Spontaneous?'
+# (+) 'Compound' 'Object ID' 'Chemical Formula' 'InChI-Key' 'ChEBI'
+# For all BioCyc files the order should be the same. If the organism does not occur in the BioCyc database the
+# complete tables for reactions can be used with the same columns.
 
 ### ModelSEED comparison ###
 modelseed: FALSE # set to False if not needed

diff --git a/data/bigg_models_metabolites.txt b/data/bigg_models_metabolites.txt
diff --git a/main.py b/main.py
@@ -6,7 +6,7 @@
 import pandas as pd
 from datetime import date
 
-__author__ = "Famke Baeuerle"
+__author__ = "Famke Baeuerle and Gwendolyn O. Gusak"
 
 def main():
     """main function to run the program"""
@@ -79,8 +79,8 @@ def main():
             if (config['memote']):
                 score = rg.investigate.run_memote(model_cobra)
 
-            if (config['genecomp']):
-                genecomp = rg.genecomp.kegg_gene_comp(model_libsbml, config['organismid'], config['biggreactions'], config['gff_file'])
+            if (config['gapfill_analysis']):
+                gapfill_analysis = rg.gapfill.gapfill_analysis(model_libsbml, config['gapfill_analysis_params'])
 
             if(config['modelseed']):
                 charge_mismatch, formula_mismatch = rg.modelseed.compare_to_modelseed(config['modelseedpath'], model_cobra)
@@ -102,7 +102,24 @@ def main():
                 print('Charge unbalanced reactions: ' + str(charge_unbal))
                 print(growth_sim)
                 print(egc)
-                if(config['genecomp']): print(genecomp)
+                if(config['gapfill_analysis']):  # report the gap-fill result on the command line
+                    if type(gapfill_analysis) == tuple:  # tuple result: several tables, labelled BioCyc below
+                        print('BioCyc - Statistics on missing entities:')
+                        print(gapfill_analysis[0])
+                        print('BioCyc - Missing genes for reactions table:')
+                        print(gapfill_analysis[1])
+                        print('BioCyc - Missing metabolites with BiGG ID table:')
+                        print(gapfill_analysis[2])
+                        print('BioCyc - Missing metabolites without BiGG ID table:')
+                        print(gapfill_analysis[3])
+                        print('BioCyc - Missing reactions with BiGG ID table:')
+                        print(gapfill_analysis[4])
+                        if len(gapfill_analysis) == 6:  # optional sixth table holds the KEGG reactions
+                            print('KEGG - Missing reactions with BiGG ID table:')
+                            print(gapfill_analysis[5])  # last item of a 6-tuple; index 6 raised IndexError
+                    else:  # non-tuple result: a single table, labelled KEGG
+                        print('KEGG - Gap fill analysis')
+                        print(gapfill_analysis)
                 if(config['modelseed']):
                     print(charge_mismatch)
                     print(formula_mismatch)
@@ -118,11 +135,20 @@ def main():
                     model_params.to_excel(writer, sheet_name='model params', index=False)
                     growth_sim.to_excel(writer, sheet_name='growth simulation', index=False)
                     egc.to_excel(writer, sheet_name='EGC test', index=False)
-                    if(config['genecomp']):
-                        genecomp.to_excel(writer, sheet_name='gene comparison', index=False)
+                    if(config['gapfill_analysis']) and type(gapfill_analysis) != tuple:
+                        gapfill_analysis.to_excel(writer, sheet_name='KEGG gap fill analysis', index=False)
                     if(config['modelseed']):
                         charge_mismatch.to_excel(writer, sheet_name='charge mismatches', index=False)
                         formula_mismatch.to_excel(writer, sheet_name='formula mismatches', index=False)
+                if(config['gapfill_analysis']) and type(gapfill_analysis) == tuple: 
+                    with pd.ExcelWriter(config['out_path'] + name + '_gapfill_analysis_' + str(today) + '.xlsx') as writer:
+                        gapfill_analysis[0].to_excel(writer, sheet_name='gap fill statistics', index=False)
+                        gapfill_analysis[1].to_excel(writer, sheet_name='genes', index=False)
+                        gapfill_analysis[2].to_excel(writer, sheet_name='metabolites', index=False)
+                        gapfill_analysis[3].to_excel(writer, sheet_name='metabolites without BiGG IDs', index=False)
+                        gapfill_analysis[4].to_excel(writer, sheet_name='reactions', index=False)
+                        if len(gapfill_analysis) == 6:
+                            gapfill_analysis[5].to_excel(writer, sheet_name='KEGG reactions', index=False)
 
             if (config['output'] == 'csv'): # csv file
                 print('---')
@@ -135,8 +161,17 @@ def main():
                 model_info.to_csv(name + '_modelinfo.csv', index=False)
                 growth_sim.to_csv(name +'_growthsim.csv', index=False)
                 egc.to_csv(name + '_egc.csv', index=False)
-                if(config['genecomp']):
-                    genecomp.to_csv(name +'_genecomp.csv', index=False)
+                if(config['gapfill_analysis']):  # write the gap-fill result as CSV file(s)
+                    if type(gapfill_analysis) == tuple:  # tuple result: one CSV per table
+                        gapfill_analysis[0].to_csv(name +'_BioCyc_analysis_statistics.csv', index=False)
+                        gapfill_analysis[1].to_csv(name +'_BioCyc_analysis_genes.csv', index=False)
+                        gapfill_analysis[2].to_csv(name +'_BioCyc_analysis_metabolites.csv', index=False)
+                        gapfill_analysis[3].to_csv(name +'_BioCyc_analysis_metabolites_wo_BiGG.csv', index=False)
+                        gapfill_analysis[4].to_csv(name +'_BioCyc_analysis_reactions.csv', index=False)
+                        if len(gapfill_analysis) == 6:  # optional sixth table holds the KEGG reactions
+                            gapfill_analysis[5].to_csv(name +'_KEGG_gapfill_analysis.csv', index=False)
+                    else:  # non-tuple result: a single table
+                        gapfill_analysis.to_csv(name +'_KEGG_gapfill_analysis.csv', index=False)  # was `gapfill`, an unbound name (NameError)
                 if(config['modelseed']):
                     charge_mismatch.to_csv(name + '_charge_mismatch.csv', index=False)
                     formula_mismatch.to_csv(name + '_formula_mismatch.csv', index=False)

diff --git a/refinegems/__init__.py b/refinegems/__init__.py
@@ -7,8 +7,13 @@
 import refinegems.pathways
 import refinegems.curate
 import refinegems.polish
+import refinegems.parse
 import refinegems.cvterms
-import refinegems.genecomp
+import refinegems.analysis_db
+import refinegems.analysis_biocyc
+import refinegems.analysis_kegg
+import refinegems.entities
+import refinegems.gapfill
 import refinegems.comparison
 import refinegems.investigate
 import refinegems.load