-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
208 lines (145 loc) · 7.29 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import os
import re
import json
import math
import pickle
import pandas as pd
import make_networks
import networkx as nx
import download_files
if __name__ == '__main__':
# Get edges from string (make initial networks)
# Get node properties from string (add to networks)
# Get node properties from Uniprot (add to networks)
# Get node values during infection from ViPR (add to networks)
# Make initial networks in Python → NetworkX objects (save networks to files)
# Trim networks with SPRAS (trim networks, save to files)
# Network measures → Store values on nodes, edges, whole graph (add to networks)
# Network perturbations → Store values on nodes and edges (add to networks)
# ----------- make directories -----------
data = 'data'
if not os.path.exists(data):
os.makedirs(data)
measures = os.path.join('data', 'measures')
if not os.path.exists(measures):
os.makedirs(measures)
input_files = os.path.join('data', 'input_files')
if not os.path.exists(input_files):
os.makedirs(input_files)
networks = 'networks'
if not os.path.exists(networks):
os.makedirs(networks)
cytoscape = os.path.join('networks', 'cytoscape')
if not os.path.exists(cytoscape):
os.makedirs(cytoscape)
full = os.path.join('networks', 'full')
if not os.path.exists(full):
os.makedirs(full)
trimmed = os.path.join('networks', 'trimmed')
if not os.path.exists(trimmed):
os.makedirs(trimmed)
final = os.path.join('networks', 'final')
if not os.path.exists(final):
os.makedirs(final)
# ----------- download input files -----------
source = 'string_edges'
file_downloader = download_files.FileDownloader()
file_downloader.single_file(source)
# ----------- make full networks -----------
network_maker = make_networks.MakeNetworks()
# save edge-only network to file
edge_only_network_file = os.path.join(full, 'edges-only.p')
#network_maker.make_edge_network(os.path.join('data', 'input_files', 'string_edges.txt'), edge_only_network_file)
# get node list for CHTC
#node_list_file_basename = os.path.join(full, 'pid_input_files', 'protein_ids_')
#network_maker.make_node_list_chtc(edge_only_network_file, node_list_file_basename)
# collect uniprot data using CHTC
# copy CHTC/get_uniprot_data.py and pid_input_files to HTC.learn
# transfer results/ from HTC back to networks/full
# add uniprot node information to networks
whole_network_file = os.path.join(full, 'whole-network.p')
#network_maker.add_uniprot_node_data(edge_only_network_file, whole_network_file)
# separate this full network into virus-host pairs (1:1)
whole_11_networks = os.path.join(full, 'whole-11-networks.p')
network_maker.split_into_subnetworks(whole_network_file, whole_11_networks)
quit()
# ----------- trim networks -----------
# use SPRAS and ViPR to trim networks
# ----------- measure networks -----------
# measure networks
# ----------- plot results -----------
edges_file = os.path.join(data_dir, 'protein.links.full.v10.5.txt')
nodes_dir = os.path.join(data_dir)
networks_file_out = os.path.join('OLD/networks', 'string_networks.p')
# Evidence-based and textmined separate networks
# 1. Filter by edge type and remove lone nodes
# 2. Filter nodes to ones that only contain virus-host nodes
# 3. Take subgraph of those nodes only
filtered_experiments_networks_file_out = os.path.join('OLD/networks', 'filtered_experiments_string_networks.p')
filtered_textmining_networks_file_out = os.path.join('OLD/networks', 'filtered_textmining_string_networks.p')
measured_experiments_networks_file_out = os.path.join('OLD/networks', 'measured_experiments_string_networks.p')
measured_textmining_networks_file_out = os.path.join('OLD/networks', 'measured_textmining_string_networks.p')
# check to see if filtered networks are already made
if not os.path.exists(filtered_experiments_networks_file_out) and not \
os.path.exists(filtered_textmining_networks_file_out):
# check to see if networks are already made
if os.path.exists(networks_file_out):
# if so, load them in
with open(networks_file_out, 'rb') as f:
networks = pickle.load(f)
# if not, then make the networks for the first time
else:
networks = network_maker.virus_string_networks(edges_file, nodes_dir, networks_file_out)
# filter networks for first time
networks_filtnetworks_filtered_experiments, networks_filtered_textminingered = \
network_maker.filter_networks(networks, filtered_experiments_networks_file_out,
filtered_textmining_networks_file_out)
else:
with open(filtered_experiments_networks_file_out, 'rb') as f:
filtered_experiments_networks = pickle.load(f)
with open(filtered_textmining_networks_file_out, 'rb') as f:
filtered_textmining_networks = pickle.load(f)
# ----------- trim networks with SPRAS (CHTC?) -----------
# download and use proteomic data
# ----------- measure networks (refactor for CHTC) -----------
# just basic measures
# add complexity measures
# add perturbation measures
# add rows for edges too--> complexity perturbation measures, attribute information
network_ids = list(filtered_experiments_networks.keys())
# serial calculation
[network_maker.make_df(network_id, filtered_experiments_networks_file_out) for network_id in network_ids]
[network_maker.make_df(network_id, filtered_textmining_networks_file_out) for network_id in network_ids]
quit()
# for multiprocessing (these measures sure are slow...)
#with Pool(multiprocessing.cpu_count()) as p:
# p.map(functions_for_parallelization.make_df, network_ids)
# ----------- add measure values back into network as attributes and export to cytoscape -----------
for network_id in network_ids:
# grab the network
network = filtered_experiments_networks[network_id]
# load in the csv of measures
try:
measures_data = pd.read_csv(os.path.join('OLD/data_jar', 'measures_experiments_' + network_id + '.csv'))
except:
continue
# for each node, add measure attributes
measures = list(measures_data.columns)[12:20]
for index, row in measures_data.iterrows():
node_name = dict(row)['object_name']
for measure in measures:
value = dict(row)[measure]
# if not a float, will cause an issue in cytoscape
if not isinstance(value, (int, float)) or math.isnan(value) or math.isinf(value):
value = 0.0
network.nodes[node_name][measure] = value
# also print the size of the graph so I know which to visualize in cytoscape
print(str(network_id))
print(network)
print()
# make into a cytoscape object and save to folder
network = nx.cytoscape_data(network)
# get rid of space in file name
network_id = re.sub(' ', '_', network_id)
with open(os.path.join(cytoscape_jar, network_id + '.cyjs'), 'w') as outfile:
json.dump(network, outfile)