This repository has been archived by the owner on Sep 24, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
orthology.py
executable file
·158 lines (146 loc) · 6.53 KB
/
orthology.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
# coding: utf-8
import argparse
import os
import pickle
import bel_functions
# Info for BEL document header
doc_name = "BEL Framework Orthologous Genes Document"
description = "Gene orthology relationships from HGNC, RGD, and Homologene"
namespaces = {'HGNC': 'hgnc-human-genes.belns',
'MGI': 'mgi-mouse-genes.belns',
'RGD': 'rgd-rat-genes.belns',
'EGID': 'entrez-gene-ids.belns'}
annotations = {}
base_url = 'http://resource.belframework.org/belframework/testing/'
def get_egid_status(egid):
''' Check egid for update or withdrawal. '''
status = data_dict.get('egid_history').get_id_update(term_id)
return status
if __name__ == '__main__':
# command line arguments - directory for pickled data objects
parser = argparse.ArgumentParser(description="""Generate BEL orthology file from pickled data objects.""")
parser.add_argument(
"-n",
required=True,
metavar="DIRECTORY",
help="directory to store the new namespace equivalence data")
args = parser.parse_args()
if os.path.exists(args.n):
os.chdir(args.n)
else:
print('data directory {0} not found!'.format(args.n))
# access and load required data objects
data_list = [
'rgd',
'hgnc',
'rgd_ortho',
'mgi',
'egid_ortho',
'egid_history']
data_dict = {}
for files in os.listdir("."):
if files.endswith("parsed_data.pickle"):
with open(files, 'rb') as f:
d = pickle.load(f)
if str(d) in data_list:
data_dict[str(d)] = d
for data in data_list:
if data not in data_dict.keys():
print('missing required dependency {0}!'.format(data))
# Get ortho statements from HGNC data object
# MGI and RGD namespaceDataSet objects are required for translation of MGI
# and RGD Ids to labels
hgnc_ortho_statements = set()
for term_id in data_dict.get('hgnc').get_values():
term_label = data_dict.get('hgnc').get_label(term_id)
hgnc_term = bel_functions.bel_term(term_label, 'HGNC', 'g')
orthos = data_dict.get('hgnc').get_orthologs(term_id)
if orthos is not None:
for o in orthos:
if len(o.split(':')) == 2:
prefix, value = o.split(':')
if prefix == 'RGD':
o_label = data_dict.get('rgd').get_label(value)
if prefix == 'MGI':
o_label = data_dict.get('mgi').get_label(value)
if o_label is None:
print(
'WARNING - missing label for {0}, {1}'.format(prefix, value))
continue
ortho_term = bel_functions.bel_term(o_label, prefix, 'g')
hgnc_ortho_statements.add(
'{0} orthologous {1}'.format(
hgnc_term, ortho_term))
# Get ortho statements from RGD data object (Rat to Mouse only)
# requires HGNC and MGI Namespace DataSet onjects to translate HGNC and
# MGI Ids to labels
rgd_ortho_statements = set()
for term_id in data_dict.get('rgd_ortho').get_values():
term_label = data_dict.get('rgd').get_label(term_id)
rgd_term = bel_functions.bel_term(term_label, 'RGD', 'g')
orthos = data_dict.get('rgd_ortho').get_orthologs(term_id)
if orthos is not None:
for o in orthos:
prefix = ''
if len(o.split(':')) == 2:
prefix, value = o.split(':')
if prefix == 'HGNC':
# o_label = data_dict.get('hgnc').get_label(value)
continue
elif prefix == 'MGI':
o_label = data_dict.get('mgi').get_label(value)
if o_label is None:
print(
'WARNING - {2} missing label for {0}, {1}'.format(prefix, value, rgd_term))
continue
ortho_term = bel_functions.bel_term(o_label, prefix, 'g')
rgd_ortho_statements.add(
'{0} orthologous {1}'.format(
rgd_term, ortho_term))
# Get ortho statements from Homologene (egid_ortho) data object
egid_ortho_statements = set()
for term_id in data_dict.get('egid_ortho').get_values():
orthos = data_dict.get('egid_ortho').get_orthologs(term_id)
status = data_dict.get('egid_history').get_id_update(term_id)
if status == 'withdrawn':
continue
elif status is not None:
term_id = status # if replacement egid available, use it
egid_term = bel_functions.bel_term(term_id, 'EGID', 'g')
if orthos is not None:
for o in orthos:
prefix = ''
if len(o.split(':')) == 2:
prefix, value = o.split(':')
status = data_dict.get('egid_history').get_id_update(value)
if status == 'withdrawn':
continue
elif status is not None:
value = status # if replacement egid value availble from history, use it
ortho_term = bel_functions.bel_term(value, prefix, 'g')
egid_ortho_statements.add(
'{0} orthologous {1}'.format(
egid_term, ortho_term))
with open('gene-orthology.bel', 'w') as ortho:
bel_functions.write_bel_header(
ortho,
doc_name=doc_name,
description=description,
namespaces=namespaces,
annotations=annotations,
base_url=base_url)
ortho.write(
'SET Citation = {"Online Resource", "HUGO Gene Nomenclature Committee data download", "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc_complete_set.txt.gz"}\n')
for s in sorted(hgnc_ortho_statements):
ortho.write(s + '\n')
ortho.write('\n')
ortho.write(
'SET Citation = {"Online Resource","RGD Orthology FTP file", "ftp://ftp.rgd.mcw.edu/pub/data_release/RGD_ORTHOLOGS.txt"}\n')
for s in sorted(rgd_ortho_statements):
ortho.write(s + '\n')
ortho.write('\n')
ortho.write(
'SET Citation = {"Online Resource","NCBI Homologene FTP file", "ftp://ftp.ncbi.nih.gov/pub/HomoloGene/current"}\n')
for s in sorted(egid_ortho_statements):
ortho.write(s + '\n')