This repository has been archived by the owner on Sep 24, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
gene_scaffolding.py
executable file
·138 lines (120 loc) · 5.22 KB
/
gene_scaffolding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python3
import datetime
import argparse
import os
from string import Template
from bel_functions import bel_term
""" Generates Gene Scaffolding BEL script document used in Phase III.
Given the urls for gene/protein domain .belns files, uses the
encodings to generate transcribedTo and translatedTo BEL statements. """
# URL prefix placed in front of each namespace file name when the
# DEFINE NAMESPACE and SET Citation lines are written.
base_url = 'http://resources.openbel.org/belframework/testing/namespace/'

# Namespace keyword -> .belns file name.  The file name is both opened
# from the data directory and appended to base_url in the output.
namespaces = {
    'HGNC': 'hgnc-human-genes.belns',
    'MGI': 'mgi-mouse-genes.belns',
    'RGD': 'rgd-rat-genes.belns',
}

# Name of the BEL script document this script writes.
output_file = 'gene_scaffolding_document_9606_10090_10116.bel'
def translated_to(value, ns):
    """Return the BEL ``translatedTo`` statement for a namespace value.

    Both sides of the statement come from ``bel_term`` called with the
    same value and namespace keyword; the subject uses the code 'r' and
    the object uses the code 'p' (presumably the RNA and protein
    abundance functions -- confirm against ``bel_functions.bel_term``).
    """
    template = Template('${source} translatedTo ${target}')
    return template.substitute(
        source=bel_term(value, ns, 'r'),
        target=bel_term(value, ns, 'p'),
    )
def transcribed_to(value, ns):
    """Return the BEL ``transcribedTo`` statement for a namespace value.

    Both sides of the statement come from ``bel_term`` called with the
    same value and namespace keyword; the subject uses the code 'g' and
    the object uses the code 'r' (presumably the gene and RNA abundance
    functions -- confirm against ``bel_functions.bel_term``).
    """
    template = Template('${source} transcribedTo ${target}')
    return template.substitute(
        source=bel_term(value, ns, 'g'),
        target=bel_term(value, ns, 'r'),
    )
def micro_rna(value, ns):
    """Return the BEL ``transcribedTo`` statement for a microRNA value.

    Same statement shape as ``transcribedTo`` for other genes, but the
    object term is built with the code 'm' instead of 'r' (presumably the
    microRNA abundance function -- confirm against
    ``bel_functions.bel_term``).  The subject term uses the code 'g'.
    """
    template = Template('${source} transcribedTo ${target}')
    return template.substitute(
        source=bel_term(value, ns, 'g'),
        target=bel_term(value, ns, 'm'),
    )
def scaffold(belns_filename, ns):
    """Collect gene scaffolding statements and metadata from a .belns file.

    The file is read line by line.  Blank lines are skipped and a line
    starting with '[' switches the current section.  Inside [Namespace]
    the SpeciesString, NameString and CreatedDateTime values are captured.
    Inside [Values] each line is split into a value and an encoding, and
    the encoding selects the statements to generate:

        G    -> nothing
        GR   -> transcribed_to()
        GRM  -> micro_rna()
        GRP  -> transcribed_to() and translated_to()

    Lines with any other encoding produce no statements.

    Args:
        belns_filename: path of the .belns file to read.
        ns: namespace keyword handed to the statement builders.

    Returns:
        A tuple ``(statements, name, species, date)``: the set of
        generated statement strings followed by the NameString,
        SpeciesString and CreatedDateTime values.

    Raises:
        ValueError: if a [Values] line contains no '|' delimiter, or if
            any of the three metadata fields was not found in the file.
    """
    section = ''
    statements = set()
    name = species = date = None
    # NOTE(review): an earlier revision decoded each line as ISO-8859-1;
    # the file is now opened with the platform default encoding -- confirm
    # that matches the .belns files being read.
    with open(belns_filename, 'r') as belns:
        for line in belns:
            if not line.strip():
                continue
            if line.startswith('['):
                section = line.strip()
                continue
            if '[Namespace]' in section:
                # Split on the first '=' only so that a value which itself
                # contains '=' is not truncated.
                if line.startswith('SpeciesString='):
                    species = line.split('=', 1)[1].strip()
                elif line.startswith('NameString='):
                    name = line.split('=', 1)[1].strip()
                elif line.startswith('CreatedDateTime'):
                    date = line.split('=', 1)[1].strip()
            elif '[Values]' in section:
                # The encoding is the last field; splitting from the right
                # keeps values that contain '|' intact.
                value, encoding = line.rsplit('|', 1)
                encoding = encoding.strip()
                if encoding == 'GR':
                    statements.add(transcribed_to(value, ns))
                elif encoding == 'GRM':
                    statements.add(micro_rna(value, ns))
                elif encoding == 'GRP':
                    statements.add(transcribed_to(value, ns))
                    statements.add(translated_to(value, ns))
    # Fail with a readable message instead of an UnboundLocalError when
    # the file carries no usable [Namespace] metadata.
    missing = [
        label
        for label, found in (('NameString', name),
                             ('SpeciesString', species),
                             ('CreatedDateTime', date))
        if found is None
    ]
    if missing:
        raise ValueError('{0}: missing [Namespace] field(s): {1}'.format(
            belns_filename, ', '.join(missing)))
    return statements, name, species, date
# Annotation keyword -> URL written as a DEFINE ANNOTATION line in the
# generated document.
annotations = {
    'Species': 'http://resources.openbel.org/belframework/testing/annotation/species-taxonomy-id.belanno'}
today = datetime.date.today()
# The document version is the generation date formatted as YYYYMMDD.
version = today.strftime('%Y%m%d')
# Divider line written between the sections of the generated document.
separator = '#' * 50

parser = argparse.ArgumentParser(
    description="""Gene scaffolding from HGNC, RGD, and MGI .belns files.""")
parser.add_argument("-n", required=True, metavar="DIRECTORY",
                    help="directory with .belns files")
args = parser.parse_args()
belns_dir = args.n

# isdir rather than exists: a path naming a regular file would pass an
# existence check and then make os.chdir fail.
if not os.path.isdir(belns_dir):
    # Raising SystemExit with a message prints it to stderr and ends the
    # script with a non-zero status, so callers can detect the failure;
    # the bare exit() helper reported success and is only provided by the
    # site module.
    raise SystemExit('Data directory {0} not found'.format(belns_dir))
# The .belns files are opened by bare file name and the output file is
# created with a relative name, so everything happens inside this directory.
os.chdir(belns_dir)
with open(output_file, 'w') as bel:
    print('\nWriting file {0} ...'.format(output_file))

    # Document properties: fixed header lines plus the date-based version
    # and the copyright year.
    bel.writelines([
        separator,
        '\n# Document Properties Section\n',
        'SET DOCUMENT Name = "Phase III Gene Scaffolding"\n',
        'SET DOCUMENT Description = "Gene scaffolding for use with the BEL compiler."\n',
        'SET DOCUMENT Version = "{0}"\n'.format(version),
        'SET DOCUMENT Copyright = "Copyright (c) {0}, OpenBEL Project. This work is licensed under a Creative Commons Attribution 3.0 Unported License."\n'.format(
            str(today.year)),
        'SET DOCUMENT Authors = "OpenBEL"\n',
    ])

    # Definitions: one line per namespace, a blank line, then one line
    # per annotation.
    bel.write('\n' + separator + '\n')
    bel.write('# Definitions Section\n')
    for ns_prefix, ns_name in namespaces.items():
        definition = 'DEFINE NAMESPACE {0} AS URL "{1}{2}"\n'.format(
            ns_prefix, base_url, ns_name)
        bel.write(definition)
    bel.write('\n')
    for anno, url in annotations.items():
        bel.write('DEFINE ANNOTATION {0} AS URL "{1}"\n'.format(anno, url))

    # Statements: for every namespace file, the species and citation
    # context followed by its sorted scaffolding statements.
    bel.write(separator + '\n')
    bel.write('# Statements Section\n\n')
    for ns_prefix, ns_name in namespaces.items():
        statements, name, species, date = scaffold(ns_name, ns_prefix)
        bel.write('SET Species = {0}\n'.format(species))
        citation = 'SET Citation = {3}"Online Resource", "{0}", "{1}{5}", "{2}", "", ""{4}\n\n'.format(
            name, base_url, date, '{', '}', ns_name)
        bel.write(citation)
        print('\n\tGenerated {0} scaffolding statements for {1}. '.format(
            str(len(statements)), name))
        bel.writelines(s + '\n' for s in sorted(statements))
        bel.write('\n\n')