Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gapfill update #61

Merged
merged 23 commits into from
Feb 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7a75107
Created analysis_biocyc.py #52
GwennyGit Feb 2, 2023
9c4d9f5
Created entities.py #52
GwennyGit Feb 2, 2023
875e4eb
Created gapfill.py #52
GwennyGit Feb 2, 2023
96eda8f
Renamed genecomp to analysis_kegg.py #52
GwennyGit Feb 2, 2023
0326be3
Updated __init__.py due to module changes #52
GwennyGit Feb 2, 2023
0c58e02
Adjusted code in curate #52
GwennyGit Feb 2, 2023
810f9d9
Adjusted code in analysis_kegg #52
GwennyGit Feb 2, 2023
1f95746
Added code for function gapfill_analysis #52
GwennyGit Feb 8, 2023
890f2fa
Updated analysis_biocyc #52
GwennyGit Feb 8, 2023
e94fe4d
Updated analysis_kegg due to refactoring #52
GwennyGit Feb 8, 2023
e947d42
Updated entities due to refactoring #52
GwennyGit Feb 8, 2023
8eeb1ed
Added InChI-Key to metabol_db_dict in cvterms #52 #59
GwennyGit Feb 8, 2023
624131b
Created analysis_db due to refactoring #52
GwennyGit Feb 8, 2023
78c4fba
Created parse due to refactoring #52
GwennyGit Feb 8, 2023
8f65d0d
Updated __init__ due to new modules #52
GwennyGit Feb 8, 2023
fac0df6
Updated main due to new function gapfill_analysis #52
GwennyGit Feb 8, 2023
f7238e7
Added bigg_models_metabolites.txt for gapfill #52
GwennyGit Feb 8, 2023
0b5c6f4
Updated config due to new function gapfill_analysis #52
GwennyGit Feb 8, 2023
5bbac70
Fixed problems in analysis_biocyc #52
GwennyGit Feb 9, 2023
58f015c
Improved BiGG ID to other ID mapping #52
GwennyGit Feb 9, 2023
3f63033
Changed create_gp as preparation of analysis_biocyc #52
GwennyGit Feb 9, 2023
7b32a19
Added column name to parse_fasta_headers result #52
GwennyGit Feb 9, 2023
f296b6f
Removed test outpath from main #52
GwennyGit Feb 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,28 @@ memote: FALSE
output: xlsx #cl, xlsx, csv

# these work only with single models
### Gene comparison ###
genecomp: FALSE # set to False if not needed, only works when gprs are annotated with kegg
# the following is only relevant when turned on
organismid: 'T05059' # C. striatum
gff_file: 'data/cstr.gff' # C. striatum
biggreactions: 'data/bigg_models_reactions.txt'
### Gapfill ###
# All parameters are required for all db_to_compare choices except:
# - organismid which is only required for db_to_compare: 'KEGG'/'KEGG+BioCyc'
# - and biocyc_files which is not required for 'KEGG'
gapfill_analysis: TRUE
gapfill_analysis_params:
db_to_compare: 'KEGG' # One of the choices KEGG|BioCyc|GFF|KEGG+BioCyc
organismid: 'T05059' # Needs to be specified for KEGG and KEGG+BioCyc
bigg_dbs:
- 'data/bigg_models_reactions.txt' # Path to BiGG reactions database
- 'data/bigg_models_metabolites.txt' # Path to BiGG metabolites database
gff_file: 'data/cstr.gff' # Path to RefSeq GFF file
biocyc_files:
- 'Path0' # Path to TXT file containing a SmartTable from BioCyc with the columns 'Accession-2' 'Reaction of gene' (-)
- 'Path1' # Path to TXT file containing a SmartTable with all reaction relevant information (*)
- 'Path2' # Path to TXT file containing a SmartTable with all metabolite relevant information (+)
- 'Path3' # Path to protein FASTA file used as input for CarveMe (Needed to get the protein IDs from the locus tags)
# (-) If the organism is not in BioCyc, retrieve a table mapping all reactions in BioCyc to the corresponding sequence
# (*) 'Reaction' 'Reactants of reaction' 'Products of reaction' 'EC-Number' 'Reaction-Direction' 'Spontaneous?'
# (+) 'Compound' 'Object ID' 'Chemical Formula' 'InChI-Key' 'ChEBI'
# For all BioCyc files the order should be the same. If the organism does not occur in the BioCyc database the
# complete tables for reactions can be used with the same columns.

### ModelSEED comparison ###
modelseed: FALSE # set to False if not needed
Expand Down
15,725 changes: 15,725 additions & 0 deletions data/bigg_models_metabolites.txt

Large diffs are not rendered by default.

51 changes: 43 additions & 8 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pandas as pd
from datetime import date

__author__ = "Famke Baeuerle"
__author__ = "Famke Baeuerle and Gwendolyn O. Gusak"

def main():
"""main function to run the program"""
Expand Down Expand Up @@ -79,8 +79,8 @@ def main():
if (config['memote']):
score = rg.investigate.run_memote(model_cobra)

if (config['genecomp']):
genecomp = rg.genecomp.kegg_gene_comp(model_libsbml, config['organismid'], config['biggreactions'], config['gff_file'])
if (config['gapfill_analysis']):
gapfill_analysis = rg.gapfill.gapfill_analysis(model_libsbml, config['gapfill_analysis_params'])

if(config['modelseed']):
charge_mismatch, formula_mismatch = rg.modelseed.compare_to_modelseed(config['modelseedpath'], model_cobra)
Expand All @@ -102,7 +102,24 @@ def main():
print('Charge unbalanced reactions: ' + str(charge_unbal))
print(growth_sim)
print(egc)
if(config['genecomp']): print(genecomp)
# Console output of the gap fill analysis (only when it was requested in the config).
if config['gapfill_analysis']:
    if isinstance(gapfill_analysis, tuple):
        # Tuple result: elements 0-4 are printed under BioCyc headings.
        print('BioCyc - Statistics on missing entities:')
        print(gapfill_analysis[0])
        print('BioCyc - Missing genes for reactions table:')
        print(gapfill_analysis[1])
        print('BioCyc - Missing metabolites with BiGG ID table:')
        print(gapfill_analysis[2])
        print('BioCyc - Missing metabolites without BiGG ID table:')
        print(gapfill_analysis[3])
        print('BioCyc - Missing reactions with BiGG ID table:')
        print(gapfill_analysis[4])
        if len(gapfill_analysis) == 6:
            print('KEGG - Missing reactions with BiGG ID table:')
            # The sixth element sits at index 5; index 6 is out of range
            # for a 6-tuple and raised IndexError.
            print(gapfill_analysis[5])
    else:
        # Non-tuple result: printed as a single object under a KEGG heading.
        print('KEGG - Gap fill analysis')
        print(gapfill_analysis)
if(config['modelseed']):
print(charge_mismatch)
print(formula_mismatch)
Expand All @@ -118,11 +135,20 @@ def main():
model_params.to_excel(writer, sheet_name='model params', index=False)
growth_sim.to_excel(writer, sheet_name='growth simulation', index=False)
egc.to_excel(writer, sheet_name='EGC test', index=False)
if(config['genecomp']):
genecomp.to_excel(writer, sheet_name='gene comparison', index=False)
if(config['gapfill_analysis']) and type(gapfill_analysis) != tuple:
gapfill_analysis.to_excel(writer, sheet_name='KEGG gap fill analysis', index=False)
if(config['modelseed']):
charge_mismatch.to_excel(writer, sheet_name='charge mismatches', index=False)
formula_mismatch.to_excel(writer, sheet_name='formula mismatches', index=False)
if(config['gapfill_analysis']) and type(gapfill_analysis) == tuple:
with pd.ExcelWriter(config['out_path'] + name + '_gapfill_analysis_' + str(today) + '.xlsx') as writer:
gapfill_analysis[0].to_excel(writer, sheet_name='gap fill statistics', index=False)
gapfill_analysis[1].to_excel(writer, sheet_name='genes', index=False)
gapfill_analysis[2].to_excel(writer, sheet_name='metabolites', index=False)
gapfill_analysis[3].to_excel(writer, sheet_name='metabolites without BiGG IDs', index=False)
gapfill_analysis[4].to_excel(writer, sheet_name='reactions', index=False)
if len(gapfill_analysis) == 6:
gapfill_analysis[5].to_excel(writer, sheet_name='KEGG reactions', index=False)

if (config['output'] == 'csv'): # csv file
print('---')
Expand All @@ -135,8 +161,17 @@ def main():
model_info.to_csv(name + '_modelinfo.csv', index=False)
growth_sim.to_csv(name +'_growthsim.csv', index=False)
egc.to_csv(name + '_egc.csv', index=False)
if(config['genecomp']):
genecomp.to_csv(name +'_genecomp.csv', index=False)
# CSV export of the gap fill analysis (only when it was requested in the config).
if config['gapfill_analysis']:
    if isinstance(gapfill_analysis, tuple):
        # Tuple result: one CSV file per table, elements 0-4 under BioCyc file names.
        gapfill_analysis[0].to_csv(name +'_BioCyc_analysis_statistics.csv', index=False)
        gapfill_analysis[1].to_csv(name +'_BioCyc_analysis_genes.csv', index=False)
        gapfill_analysis[2].to_csv(name +'_BioCyc_analysis_metabolites.csv', index=False)
        gapfill_analysis[3].to_csv(name +'_BioCyc_analysis_metabolites_wo_BiGG.csv', index=False)
        gapfill_analysis[4].to_csv(name +'_BioCyc_analysis_reactions.csv', index=False)
        if len(gapfill_analysis) == 6:
            # Optional sixth table is written under the KEGG file name.
            gapfill_analysis[5].to_csv(name +'_KEGG_gapfill_analysis.csv', index=False)
    else:
        # Non-tuple result: write the result object itself. The previous code
        # called `gapfill.to_csv`, a name that is never assigned here.
        gapfill_analysis.to_csv(name +'_KEGG_gapfill_analysis.csv', index=False)
if(config['modelseed']):
charge_mismatch.to_csv(name + '_charge_mismatch.csv', index=False)
formula_mismatch.to_csv(name + '_formula_mismatch.csv', index=False)
Expand Down
7 changes: 6 additions & 1 deletion refinegems/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,13 @@
import refinegems.pathways
import refinegems.curate
import refinegems.polish
import refinegems.parse
import refinegems.cvterms
import refinegems.genecomp
import refinegems.analysis_db
import refinegems.analysis_biocyc
import refinegems.analysis_kegg
import refinegems.entities
import refinegems.gapfill
import refinegems.comparison
import refinegems.investigate
import refinegems.load
Expand Down
Loading