This repository has been archived by the owner on Sep 24, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocessReactome.py
executable file
·281 lines (213 loc) · 10.3 KB
/
processReactome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Usage: program.py <customer>
program.py <customer>
program.py (-h | --help)
program.py --version
Options:
-h --help Show help.
--version Show version.
"""
# TODO
#
# Ty found these errors
# Group-20 and Group-30 are identical, including rxnId. Another pair is Group-1383 and Group-1417. In total there are 80 pairs with identical rxnIds, and three sets of three identical rxnIds. I spot checked a few, all the info (SET, bel statements, etc) were the same within a pair that have the same rxnId.
# WSH Should be fixed by creating a set from ReactionList- Looks like the human, mouse and rat share rxnIds. I’ll fix that before the next drop.
# There are cases of identical “=>” relationships that are not due to identical rxnIds. I look into one of them (Group-65 and Group-66). In this case, the evidence lines for these groups are the same except for GYS1-a is in one and GYS1-b is in the other. A google search tells me that a is the unphosphorylated form, and b is the phosphorylated form. Everything other than the evidence lines for these groups are the same, which seems like a reasonable interpretation. I looked up a few more of these cases and they all were cases where there were separate reactions for the phosphorylated and unphosphorylated forms of an enzyme catalyzing the same reaction.
# WSH - Not sure how to handle this. Need a more sophisticated parse of Reactome at the very least to capture these subtleties.
# There is some weird nesting of the location with complexes. See Group-10. Essentially, there are complexes with loc() of this form:
# complex(complex(p(x,loc(Y)), loc(y)), loc(y)), with a single hasComponent statement showing that it is composed of complex(p(x,loc(Y)), loc(y)). I’m still working out exactly how we will be using the hasComponent statements – right now I don’t foresee needing this to be fixed, but I may run into scenarios where this structure causes problems.
# WSH - The highlighted complex presentation is a bug – I’ll fix that too. I de-dup the dimers, but didn’t sort out the locations at the same time.
import os
import copy
import time
import re
from jinja2 import Environment, FileSystemLoader
import click
from toBel import toBel, dedup, dedupList, escapeBelString, setBelVersion
from reactome_webservice import getEntityData, getReactions
# Overwrite logs on each run -> filemode = 'w'
import log_setup
log = log_setup.getLogger(name='root')
# log.debug('This message should go to the log file')
# log.info('So should this')
# log.warning('And this, too')
# Jinja template
PATH = os.path.dirname(os.path.abspath(__file__))
TEMPLATE_ENVIRONMENT = Environment(
autoescape=False,
loader=FileSystemLoader(os.path.join(PATH, 'templates')),
trim_blocks=False)
template_filename = 'belscript.jinja2'
speciesList = ['Homo sapiens', 'Mus musculus', 'Rattus norvegicus']
def convertSpeciesNameToTaxId(name):
species = {
'Homo sapiens': 9606,
'Mus musculus': 10090,
'Rattus norvegicus': 10116,
}
return species[name]
def render_template(template_filename, evidences, pathways=None):
context = {}
# Todo add the following to a configuration file and automate the date
if pathways:
setname = '{} Pathway Reactome Reactions'.format(' and '.join(pathways))
else:
setname = 'All Reactome Reactions'
context['BEL_DOCUMENT_NAME'] = setname
context['BEL_DOCUMENT_DESCRIPTION'] = '{} converted to BEL 1.0'.format(setname)
context['AUTHORS'] = 'Selventa; Nimisha Schneider; Natalie Catlett; William Hayes'
context['DATE'] = time.strftime("%Y-%m-%d")
context['VERSION'] = '0.1'
context['CONTACT_EMAIL'] = '[email protected]'
context['evidences'] = evidences
return TEMPLATE_ENVIRONMENT.get_template(template_filename).render(context)
def buildStatements(catalysts, inputs, outputs):
input_bel = ', '.join([bel for input in inputs for bel in input])
output_bel = ', '.join([bel for output in outputs for bel in output])
rxn = 'rxn(reactants({}), products({}))'.format(input_bel, output_bel)
statements = []
if catalysts:
for catalyst in catalysts:
if catalyst:
for bel in catalyst:
statement = 'cat({}) => {}'.format(bel, rxn)
statements.append(statement)
else:
log.error('No BEL in catalyst: {}'.format(catalysts))
if not statements:
statements.append(rxn)
for results in (catalysts, inputs, outputs):
for result in results:
if result:
for bel in result:
for statement in result[bel]:
statements.append(statement)
else:
log.error('No BEL in result: {}'.format(results))
return statements
def buildBelEvidences(reactionList, belversion, pathways=None):
''' Load reactions and build BEL Evidences'''
rxnUrlTpl = 'http://www.reactome.org/PathwayBrowser/#'
evidences = []
bad_namespaces_evidences = [] # collect bad evidences
# indexCnt = 0
for rxnId, rxnName in reactionList:
print('rxnId: ', rxnId)
# indexCnt += 1
# if indexCnt > 2:
# break
# Process Annotation information
rxnData = getEntityData(rxnId)
if 'stableIdentifier' not in rxnData:
continue
stableId = rxnData['stableIdentifier']['displayName']
rxnUrl = '{}{}'.format(rxnUrlTpl, stableId)
rxnName = escapeBelString(rxnData['displayName'])
rxnType = rxnData['schemaClass']
# Todo collect all compartments and annotate
compartment = 'Unknown'
if 'compartment' in rxnData:
compartment = rxnData['compartment'][0]['displayName']
rxnAuthor = rxnDate = None
if 'created' in rxnData:
try:
matches = re.search(r'(.*?),\s+(\d{4,4}-\d{2,2}-\d{2,2})', rxnData['created']['displayName'])
if matches:
rxnAuthor = matches.group(1)
rxnDate = matches.group(2)
except:
log.info('Rxn - cannot find created date and author in object: {}'.format(rxnId))
if rxnDate and rxnAuthor:
citation = '{{"Online Resource", "{}", "{}", "{}", "{}"}}'.format(rxnName, rxnUrl, rxnDate, rxnAuthor)
else:
citation = '{{"Online Resource", "{}", "{}"}}'.format(rxnName, rxnUrl)
evidence = {
'name': rxnName,
'rxnId': rxnId, # TODO remove after debugging
'rxnType': rxnType,
'compartment': compartment,
'species': rxnData['speciesName'],
'species_tax_id': convertSpeciesNameToTaxId(rxnData['speciesName']),
'summary_text': rxnName,
'citation': citation,
}
# Process BEL Statement
catalysts, inputs, outputs = [], [], []
if 'catalystActivity' in rxnData:
for catalyst in rxnData['catalystActivity']:
catalysts.append(toBel(catalyst['dbId']))
# print('Catalyst: {}'.format(catalyst['dbId']))
if 'input' in rxnData:
for input in dedup(rxnData['input']):
inputs.append(toBel(input['dbId']))
if 'output' in rxnData:
for output in dedup(rxnData['output']):
outputs.append(toBel(output['dbId']))
print('Catalysts ', catalysts)
print('Inputs ', inputs)
print('Outputs ', outputs)
print('\n')
statements = buildStatements(catalysts, inputs, outputs)
evidence['statements'] = dedupList(statements)
bad_namespace_flag = False
for statement in statements:
if 'ENSEMBL' in statement or 'EMBL' in statement:
bad_namespace_flag = True
# with open('tmp_evidences.json', 'a') as f:
# json.dump(evidence, f, indent=4)
# f.write('\n\n')
if bad_namespace_flag:
bad_namespaces_evidences.append(copy.deepcopy(evidence))
else:
evidences.append(copy.deepcopy(evidence))
belscript = render_template(template_filename, evidences, pathways=pathways)
fn = 'reactome.bels'
if belversion == '2':
fn += '2'
with open(fn, 'w') as f:
f.write(belscript)
import json
with open('bad_evidences.json', 'w') as f:
json.dump(bad_namespaces_evidences, f, indent=4)
@click.command()
@click.option('--belversion', '-b', default='1', type=click.Choice(['1', '2']), help="Use Bel 1 by default or select Bel 2")
@click.option('--species', '-s', multiple=True, type=click.Choice(['all', 'Homo sapiens', 'Mus musculus', 'Rattus norvegicus']))
@click.option('--pathways', '-p', default=None, multiple=True, help="Restrict to specific Reactome Pathway(s) - e.g. Metabolism - can use multiple -p Metabolism -p Pathway2 ...")
def main(belversion, species, pathways):
"""Process Reactome into BEL
Example: ./processReactome.py -b 1 -s "Homo sapiens" -s "Mus musculus" -p Metabolism
Example: ./processReactome.py -b 2 -s "Homo sapiens" -p Metabolism -p "Transmembrane transport of small molecules"
Result: reactome.bels
"""
setBelVersion(belversion)
# quit and show help if no arguments are set
if not species:
click.help_option()
print('Here')
if 'all' in species:
species = speciesList
# Collect reactions
reactionList = []
for specie in species:
reactionList.extend(getReactions(specie, pathways=pathways))
# import json
# with open('reactionlist.json', 'w') as f:
# json.dump(reactionList, f, indent=4)
# quit()
# human, mouse and rat share rxnIds
reactionList = set(reactionList)
buildBelEvidences(reactionList, belversion, pathways=pathways)
# buildBelEvidences([('109514', 'Test')])
# # buildBelEvidences([('450092', 'Test')])
# quit()
# # Get Metabolism reactions for human only
# reactionList = getReactions('homo sapiens', pathway='Metabolism')
# quit()
# # Get all Reactome reactions for model organisms
# reactionList = []
# for species in ['Homo sapiens', 'Mus musculus', 'Rattus norvegicus']:
# reactionList.extend(getReactions(species))
if __name__ == '__main__':
main()