from find_path import get_uri
import pandas as pd
from igraph import Graph
import numpy as np
import os

def ranked_comparison(output_dir, **value_dfs):
    df = pd.DataFrame()
    for name, value_df in value_dfs.items():
        paths_list = list(value_df['Value'])
        # Rank each path value against the descending sort of all values
        ranks = [sorted(paths_list, reverse=True).index(x) for x in paths_list]
        df[name] = ranks
    output_folder = output_dir + '/Evaluation_Files'
    # Create the output directory if it does not exist
    os.makedirs(output_folder, exist_ok=True)
    df.to_csv(output_folder + '/ranked_comparison.csv', sep=',', index=False)
    return df
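
# A minimal usage sketch (hypothetical; the keyword names and DataFrames are
# illustrative, and each DataFrame is assumed to have a 'Value' column as
# written by output_path_lists below):
#   ranked_df = ranked_comparison(output_dir,
#                                 CosineSimilarity=cs_paths_df,
#                                 PDP=pdp_paths_df)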

def path_length_comparison(output_dir, input_nodes_df, labels_all, search_type, **subgraph_dfs):
    df = pd.DataFrame()
    for name, sg_df in subgraph_dfs.items():
        # Reorder columns so igraph reads source (S), target (O), then edge attribute (P)
        sg_df = sg_df[['S', 'O', 'P']]
        path_lengths = []
        g = Graph.DataFrame(sg_df, directed=True, use_vids=False)
        for i in range(len(input_nodes_df)):
            node1 = input_nodes_df.iloc[i].loc['source_label']
            node2 = input_nodes_df.iloc[i].loc['target_label']
            p = g.get_all_shortest_paths(v=node1, to=node2, weights=None, mode=search_type)
            # Length of the first shortest path, counted in vertices
            path_lengths.append(len(p[0]))
        df[name] = path_lengths
    output_folder = output_dir + '/Evaluation_Files'
    # Create the output directory if it does not exist
    os.makedirs(output_folder, exist_ok=True)
    df.to_csv(output_folder + '/path_length_comparison.csv', sep=',', index=False)
    return df
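
# A minimal usage sketch (hypothetical; input_nodes is assumed to have
# 'source_label' and 'target_label' columns, and 'all' is one of igraph's
# shortest-path modes):
#   lengths_df = path_length_comparison(output_dir, input_nodes, labels_all,
#                                       'all', CosineSimilarity=cs_subgraph_df,
#                                       PDP=pdp_subgraph_df)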

def num_nodes_comparison(output_dir, **subgraph_dfs):
    df = pd.DataFrame()
    for name, sg_df in subgraph_dfs.items():
        # Unique nodes are the union of all subjects (S) and objects (O)
        n = pd.unique(sg_df[['S', 'O']].values.ravel())
        df[name] = [len(n)]
    output_folder = output_dir + '/Evaluation_Files'
    # Create the output directory if it does not exist
    os.makedirs(output_folder, exist_ok=True)
    df.to_csv(output_folder + '/num_nodes_comparison.csv', sep=',', index=False)
    return df

def get_ontology_labels(noa_df, labels_all, kg_type):
    ont_types = ['/CHEBI_', '/PR_', '/PW_', '/gene', '/MONDO_', '/HP_', '/VO_', '/EFO_', 'NCBITaxon_',
                 '/GO_', '/DOID_', '/reactome', '/SO_', 'ENSEMBL:', 'UniProt', 'GO:', 'NCBIGene', 'CHEMBL.']
    ont_labels = []
    num_intermediate_nodes = 0
    # Count intermediate ('Extra') nodes and record which ontology each belongs to
    for i in range(len(noa_df)):
        ont_found = False
        if noa_df.iloc[i].loc['Attribute'] == 'Extra':
            uri = get_uri(labels_all, noa_df.iloc[i].loc['Node'], kg_type)
            num_intermediate_nodes += 1
            for j in ont_types:
                if j in uri:
                    ont_labels.append(j)
                    ont_found = True
            if not ont_found:
                print('Ontology not accounted for in list: ', uri)
                raise Exception('Ontology type not accounted for in list: ', uri,
                                ', add this ontology type to the get_ontology_labels function (evaluation.py).')
    ont_labels, counts = np.unique(ont_labels, return_counts=True)
    return ont_labels.tolist(), counts.tolist(), num_intermediate_nodes
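
# A minimal usage sketch (hypothetical; noa_df is assumed to have 'Node' and
# 'Attribute' columns, as in the Subgraph_attributes.noa files read below):
#   ont_labels, counts, n_intermediate = get_ontology_labels(noa_df, labels_all, kg_type)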

def intermediate_nodes_comparison(output_dir, labels_all, kg_type, **noa_dfs):
    all_ont_labels = []
    df = pd.DataFrame()
    # Collect every ontology type seen across all given subgraphs; counts are not used here
    for name, n_df in noa_dfs.items():
        ont_labels, counts, num_intermediate_nodes = get_ontology_labels(n_df, labels_all, kg_type)
        all_ont_labels.extend(ont_labels)
    # List of all unique ontology types from all subgraphs
    all_ont_labels = np.unique(all_ont_labels)
    df['Ontology_Type'] = all_ont_labels
    df.sort_values(by=['Ontology_Type'], ascending=True, inplace=True)
    # Get normalized counts of each ontology type per subgraph
    for name, n_df in noa_dfs.items():
        values = []
        ont_labels, counts, num_intermediate_nodes = get_ontology_labels(n_df, labels_all, kg_type)
        # Add a zero count for any ontology type missing from this subgraph
        for i in all_ont_labels:
            if i not in ont_labels:
                ont_labels.append(i)
                counts.append(0)
        # Normalize counts by the number of intermediate nodes
        counts_norm = [c / num_intermediate_nodes for c in counts]
        onts_dict = {ont_labels[i]: counts_norm[i] for i in range(len(ont_labels))}
        # Sort the dict the same way the df is sorted
        for key in sorted(onts_dict.keys()):
            values.append(onts_dict[key])
        df[name] = values
    output_folder = output_dir + '/Evaluation_Files'
    # Create the output directory if it does not exist
    os.makedirs(output_folder, exist_ok=True)
    df.to_csv(output_folder + '/intermediate_nodes_comparison.csv', sep=',', index=False)
    return df

def edge_type_comparison(output_dir, **subgraph_dfs):
    all_edge_labels = []
    df = pd.DataFrame()
    # Collect every edge type (predicate) seen across all given subgraphs; counts are not used here
    for name, sg_df in subgraph_dfs.items():
        edge_labels, counts = np.unique(sg_df['P'], return_counts=True)
        all_edge_labels.extend(edge_labels)
    # List of all unique edge types from all subgraphs
    all_edge_labels = np.unique(all_edge_labels)
    df['Edge_Type'] = all_edge_labels
    df.sort_values(by=['Edge_Type'], ascending=True, inplace=True)
    # Get normalized counts of each edge type per subgraph
    for name, sg_df in subgraph_dfs.items():
        values = []
        # Edge types differ between subgraphs, so missing ones get a zero count
        edge_labels, counts = np.unique(sg_df['P'], return_counts=True)
        edge_labels = list(edge_labels)
        counts = list(counts)
        for i in all_edge_labels:
            if i not in edge_labels:
                edge_labels.append(i)
                counts.append(0)
        # Normalize counts by the total number of edges in the subgraph
        counts_norm = [c / len(sg_df['P']) for c in counts]
        edge_dict = {edge_labels[i]: counts_norm[i] for i in range(len(edge_labels))}
        # Sort the dict the same way the df is sorted
        for key in sorted(edge_dict.keys()):
            values.append(edge_dict[key])
        df[name] = values
    output_folder = output_dir + '/Evaluation_Files'
    # Create the output directory if it does not exist
    os.makedirs(output_folder, exist_ok=True)
    df.to_csv(output_folder + '/edge_type_comparison.csv', sep=',', index=False)
    return df
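
# A minimal usage sketch (hypothetical; keyword names and DataFrames are
# illustrative, and each subgraph DataFrame is assumed to have S/P/O columns):
#   edge_df = edge_type_comparison(output_dir,
#                                  CosineSimilarity=cs_subgraph_df,
#                                  PDP=pdp_subgraph_df)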

# Gets the input nodes, subgraph, node attributes, and path list for a specific
# algorithm; supported algorithms are CosineSimilarity and PDP
def get_subgraph_dfs(output_dir, subgraph_algorithm):
    input_nodes_file = output_dir + '/_Input_Nodes_.csv'
    input_nodes = pd.read_csv(input_nodes_file, sep='|')
    subgraph_file = output_dir + '/' + subgraph_algorithm + '/Subgraph.csv'
    subgraph_df = pd.read_csv(subgraph_file, sep='|')
    noa_file = output_dir + '/' + subgraph_algorithm + '/Subgraph_attributes.noa'
    noa_df = pd.read_csv(noa_file, sep='|')
    path_list_file = output_dir + '/Evaluation_Files/paths_list_' + subgraph_algorithm + '.csv'
    path_list = pd.read_csv(path_list_file, sep=',')
    return input_nodes, subgraph_df, noa_df, path_list

def output_path_lists(output_dir, path_list, subgraph_algorithm, idx):
    df = pd.DataFrame()
    df['Value'] = path_list
    output_folder = output_dir + '/Evaluation_Files'
    # Create the output directory if it does not exist
    os.makedirs(output_folder, exist_ok=True)
    df.to_csv(output_folder + '/paths_list_' + subgraph_algorithm + '_' + str(idx) + '.csv', sep=',', index=False)

def output_num_paths_pairs(output_dir, num_paths_df, subgraph_algorithm):
    output_folder = output_dir + '/Evaluation_Files'
    # Create the output directory if it does not exist
    os.makedirs(output_folder, exist_ok=True)
    num_paths_df.to_csv(output_folder + '/num_paths_' + subgraph_algorithm + '.csv', sep=',', index=False)
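
# A minimal end-to-end sketch of how these functions might be chained. The
# algorithm names follow the comment on get_subgraph_dfs; the output_dir value
# is hypothetical and assumes the directory layout read by get_subgraph_dfs.
if __name__ == '__main__':
    output_dir = './output'  # hypothetical run directory
    cs_inputs, cs_subgraph, cs_noa, cs_paths = get_subgraph_dfs(output_dir, 'CosineSimilarity')
    pdp_inputs, pdp_subgraph, pdp_noa, pdp_paths = get_subgraph_dfs(output_dir, 'PDP')
    ranked_comparison(output_dir, CosineSimilarity=cs_paths, PDP=pdp_paths)
    num_nodes_comparison(output_dir, CosineSimilarity=cs_subgraph, PDP=pdp_subgraph)
    edge_type_comparison(output_dir, CosineSimilarity=cs_subgraph, PDP=pdp_subgraph)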