Skip to content

Commit

Permalink
Merge pull request #61 from draeger-lab/gapfill-update
Browse files Browse the repository at this point in the history
All checks completed, will be merged into dev.
  • Loading branch information
famosab authored Feb 10, 2023
2 parents b01972b + f296b6f commit e23e890
Show file tree
Hide file tree
Showing 12 changed files with 16,745 additions and 261 deletions.
28 changes: 22 additions & 6 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,28 @@ memote: FALSE
output: xlsx #cl, xlsx, csv

# these work only with single models
### Gene comparison ###
genecomp: FALSE # set to False if not needed, only works when gprs are annotated with kegg
# the following is only relevant when turned on
organismid: 'T05059' # C. striatum
gff_file: 'data/cstr.gff' # C. striatum
biggreactions: 'data/bigg_models_reactions.txt'
### Gapfill ###
# All parameters are required for all db_to_compare choices except:
# - organismid, which is only required for db_to_compare: 'KEGG'/'KEGG+BioCyc'
# - biocyc_files, which is not required for db_to_compare: 'KEGG'
gapfill_analysis: TRUE
gapfill_analysis_params:
db_to_compare: 'KEGG' # One of the choices KEGG|BioCyc|GFF|KEGG+BioCyc
organismid: 'T05059' # Needs to be specified for KEGG and KEGG+BioCyc
bigg_dbs:
- 'data/bigg_models_reactions.txt' # Path to BiGG reactions database
- 'data/bigg_models_metabolites.txt' # Path to BiGG metabolites database
gff_file: 'data/cstr.gff' # Path to RefSeq GFF file
biocyc_files:
- 'Path0' # Path to TXT file containing a SmartTable from BioCyc with the columns 'Accession-2' 'Reaction of gene' (-)
- 'Path1' # Path to TXT file containing a SmartTable with all reaction relevant information (*)
- 'Path2' # Path to TXT file containing a SmartTable with all metabolite relevant information (+)
- 'Path3' # Path to protein FASTA file used as input for CarveMe (Needed to get the protein IDs from the locus tags)
# (-) If the organism is not in BioCyc retrieve a table mapping all reactions in BioCyc to the corresponding sequence
# (*) 'Reaction' 'Reactants of reaction' 'Products of reaction' 'EC-Number' 'Reaction-Direction' 'Spontaneous?'
# (+) 'Compound' 'Object ID' 'Chemical Formula' 'InChI-Key' 'ChEBI'
# For all BioCyc files the order should be the same. If the organism does not occur in the BioCyc database, the
# complete reaction tables can be used, provided they have the same columns.

### ModelSEED comparison ###
modelseed: FALSE # set to False if not needed
Expand Down
15,725 changes: 15,725 additions & 0 deletions data/bigg_models_metabolites.txt

Large diffs are not rendered by default.

51 changes: 43 additions & 8 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pandas as pd
from datetime import date

__author__ = "Famke Baeuerle"
__author__ = "Famke Baeuerle and Gwendolyn O. Gusak"

def main():
"""main function to run the program"""
Expand Down Expand Up @@ -79,8 +79,8 @@ def main():
if (config['memote']):
score = rg.investigate.run_memote(model_cobra)

if (config['genecomp']):
genecomp = rg.genecomp.kegg_gene_comp(model_libsbml, config['organismid'], config['biggreactions'], config['gff_file'])
if (config['gapfill_analysis']):
gapfill_analysis = rg.gapfill.gapfill_analysis(model_libsbml, config['gapfill_analysis_params'])

if(config['modelseed']):
charge_mismatch, formula_mismatch = rg.modelseed.compare_to_modelseed(config['modelseedpath'], model_cobra)
Expand All @@ -102,7 +102,24 @@ def main():
print('Charge unbalanced reactions: ' + str(charge_unbal))
print(growth_sim)
print(egc)
if(config['genecomp']): print(genecomp)
if(config['gapfill_analysis']):
    # A tuple result holds the BioCyc tables in fixed positions, optionally
    # followed by a KEGG table as sixth element; any other result is printed
    # as the KEGG gap fill analysis.
    if isinstance(gapfill_analysis, tuple):
        print('BioCyc - Statistics on missing entities:')
        print(gapfill_analysis[0])
        print('BioCyc - Missing genes for reactions table:')
        print(gapfill_analysis[1])
        print('BioCyc - Missing metabolites with BiGG ID table:')
        print(gapfill_analysis[2])
        print('BioCyc - Missing metabolites without BiGG ID table:')
        print(gapfill_analysis[3])
        print('BioCyc - Missing reactions with BiGG ID table:')
        print(gapfill_analysis[4])
        if len(gapfill_analysis) == 6:
            print('KEGG - Missing reactions with BiGG ID table:')
            # The sixth element lives at index 5 (as in the xlsx/csv export);
            # index 6 is out of range for a 6-tuple and raised IndexError.
            print(gapfill_analysis[5])
    else:
        print('KEGG - Gap fill analysis')
        print(gapfill_analysis)
if(config['modelseed']):
print(charge_mismatch)
print(formula_mismatch)
Expand All @@ -118,11 +135,20 @@ def main():
model_params.to_excel(writer, sheet_name='model params', index=False)
growth_sim.to_excel(writer, sheet_name='growth simulation', index=False)
egc.to_excel(writer, sheet_name='EGC test', index=False)
if(config['genecomp']):
genecomp.to_excel(writer, sheet_name='gene comparison', index=False)
if(config['gapfill_analysis']) and type(gapfill_analysis) != tuple:
gapfill_analysis.to_excel(writer, sheet_name='KEGG gap fill analysis', index=False)
if(config['modelseed']):
charge_mismatch.to_excel(writer, sheet_name='charge mismatches', index=False)
formula_mismatch.to_excel(writer, sheet_name='formula mismatches', index=False)
if(config['gapfill_analysis']) and type(gapfill_analysis) == tuple:
with pd.ExcelWriter(config['out_path'] + name + '_gapfill_analysis_' + str(today) + '.xlsx') as writer:
gapfill_analysis[0].to_excel(writer, sheet_name='gap fill statistics', index=False)
gapfill_analysis[1].to_excel(writer, sheet_name='genes', index=False)
gapfill_analysis[2].to_excel(writer, sheet_name='metabolites', index=False)
gapfill_analysis[3].to_excel(writer, sheet_name='metabolites without BiGG IDs', index=False)
gapfill_analysis[4].to_excel(writer, sheet_name='reactions', index=False)
if len(gapfill_analysis) == 6:
gapfill_analysis[5].to_excel(writer, sheet_name='KEGG reactions', index=False)

if (config['output'] == 'csv'): # csv file
print('---')
Expand All @@ -135,8 +161,17 @@ def main():
model_info.to_csv(name + '_modelinfo.csv', index=False)
growth_sim.to_csv(name +'_growthsim.csv', index=False)
egc.to_csv(name + '_egc.csv', index=False)
if(config['genecomp']):
genecomp.to_csv(name +'_genecomp.csv', index=False)
if(config['gapfill_analysis']):
    # A tuple result is written as one CSV per table (BioCyc tables first,
    # optional KEGG table as sixth element); any other result is written as
    # the single KEGG gap fill analysis table.
    if isinstance(gapfill_analysis, tuple):
        gapfill_analysis[0].to_csv(name +'_BioCyc_analysis_statistics.csv', index=False)
        gapfill_analysis[1].to_csv(name +'_BioCyc_analysis_genes.csv', index=False)
        gapfill_analysis[2].to_csv(name +'_BioCyc_analysis_metabolites.csv', index=False)
        gapfill_analysis[3].to_csv(name +'_BioCyc_analysis_metabolites_wo_BiGG.csv', index=False)
        gapfill_analysis[4].to_csv(name +'_BioCyc_analysis_reactions.csv', index=False)
        if len(gapfill_analysis) == 6:
            gapfill_analysis[5].to_csv(name +'_KEGG_gapfill_analysis.csv', index=False)
    else:
        # Write the analysis result itself; the previous code referenced the
        # unbound name `gapfill`, which raised NameError on this path.
        gapfill_analysis.to_csv(name +'_KEGG_gapfill_analysis.csv', index=False)
if(config['modelseed']):
charge_mismatch.to_csv(name + '_charge_mismatch.csv', index=False)
formula_mismatch.to_csv(name + '_formula_mismatch.csv', index=False)
Expand Down
7 changes: 6 additions & 1 deletion refinegems/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,13 @@
import refinegems.pathways
import refinegems.curate
import refinegems.polish
import refinegems.parse
import refinegems.cvterms
import refinegems.genecomp
import refinegems.analysis_db
import refinegems.analysis_biocyc
import refinegems.analysis_kegg
import refinegems.entities
import refinegems.gapfill
import refinegems.comparison
import refinegems.investigate
import refinegems.load
Expand Down
Loading

0 comments on commit e23e890

Please sign in to comment.