forked from lmcinnes/kepler-mapper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
km.py
392 lines (347 loc) · 17.3 KB
/
km.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
from __future__ import division
import numpy as np
from collections import defaultdict
import json
import itertools
from sklearn import cluster, preprocessing, manifold
from datetime import datetime
import sys
class KeplerMapper(object):
# With this class you can build topological networks from (high-dimensional) data.
#
# 1) Fit a projection/lens/function to a dataset and transform it.
# For instance "mean_of_row(x) for x in X"
# 2) Map this projection with overlapping intervals/hypercubes.
# Cluster the points inside the interval
# (Note: we cluster on the inverse image/original data to lessen projection loss).
# If two clusters/nodes have the same members (due to the overlap), then:
# connect these with an edge.
# 3) Visualize the network using HTML and D3.js.
#
# functions
# ---------
# fit_transform: Create a projection (lens) from a dataset
# map: Apply Mapper algorithm on this projection and build a simplicial complex
# visualize: Turns the complex dictionary into a HTML/D3.js visualization
def __init__(self, verbose=2):
self.verbose = verbose
self.chunk_dist = []
self.overlap_dist = []
self.d = []
self.nr_cubes = 0
self.overlap_perc = 0
self.clusterer = False
def fit_transform(self, X, projection="sum", scaler=preprocessing.MinMaxScaler()):
# Creates the projection/lens from X.
#
# Input: X. Input features as a numpy array.
# Output: projected_X. original data transformed to a projection (lens).
#
# parameters
# ----------
# projection: Projection parameter is either a string,
# a scikit class with fit_transform, like manifold.TSNE(),
# or a list of dimension indices.
# scaler: if None, do no scaling, else apply scaling to the projection
# Default: Min-Max scaling
self.scaler = scaler
self.projection = str(projection)
# Detect if projection is a class (for scikit-learn)
if str(type(projection))[1:6] == "class": #TODO: de-ugly-fy
reducer = projection
if self.verbose > 0:
try:
projection.set_params(**{"verbose":self.verbose})
except:
pass
print("\n..Projecting data using: \n\t%s\n"%str(projection))
X = reducer.fit_transform(X)
# Detect if projection is a string (for standard functions)
if isinstance(projection, str):
if self.verbose > 0:
print("\n..Projecting data using: %s"%(projection))
# Stats lenses
if projection == "sum": # sum of row
X = np.sum(X, axis=1).reshape((X.shape[0],1))
if projection == "mean": # mean of row
X = np.mean(X, axis=1).reshape((X.shape[0],1))
if projection == "median": # mean of row
X = np.median(X, axis=1).reshape((X.shape[0],1))
if projection == "max": # max of row
X = np.max(X, axis=1).reshape((X.shape[0],1))
if projection == "min": # min of row
X = np.min(X, axis=1).reshape((X.shape[0],1))
if projection == "std": # std of row
X = np.std(X, axis=1).reshape((X.shape[0],1))
if projection == "dist_mean": # Distance of x to mean of X
X_mean = np.mean(X, axis=0)
X = np.sum(np.sqrt((X - X_mean)**2), axis=1).reshape((X.shape[0],1))
# Detect if projection is a list (with dimension indices)
if isinstance(projection, list):
if self.verbose > 0:
print("\n..Projecting data using: %s"%(str(projection)))
X = X[:,np.array(projection)]
# Scaling
if scaler is not None:
if self.verbose > 0:
print("\n..Scaling with: %s\n"%str(scaler))
X = scaler.fit_transform(X)
return X
def map(self, projected_X, inverse_X=None, clusterer=cluster.DBSCAN(eps=0.5,min_samples=3), nr_cubes=10, overlap_perc=0.1):
# This maps the data to a simplicial complex. Returns a dictionary with nodes and links.
#
# Input: projected_X. A Numpy array with the projection/lens.
# Output: complex. A dictionary with "nodes", "links" and "meta information"
#
# parameters
# ----------
# projected_X projected_X. A Numpy array with the projection/lens. Required.
# inverse_X Numpy array or None. If None then the projection itself is used for clustering.
# clusterer Scikit-learn API compatible clustering algorithm. Default: DBSCAN
# nr_cubes Int. The number of intervals/hypercubes to create.
# overlap_perc Float. The percentage of overlap "between" the intervals/hypercubes.
start = datetime.now()
# Helper function
def cube_coordinates_all(nr_cubes, nr_dimensions):
# Helper function to get origin coordinates for our intervals/hypercubes
# Useful for looping no matter the number of cubes or dimensions
# Example: if there are 4 cubes per dimension and 3 dimensions
# return the bottom left (origin) coordinates of 64 hypercubes,
# as a sorted list of Numpy arrays
# TODO: elegance-ify...
l = []
for x in range(nr_cubes):
l += [x] * nr_dimensions
return [np.array(list(f)) for f in sorted(set(itertools.permutations(l,nr_dimensions)))]
nodes = defaultdict(list)
links = defaultdict(list)
complex = {}
self.nr_cubes = nr_cubes
self.clusterer = clusterer
self.overlap_perc = overlap_perc
if self.verbose > 0:
print("Mapping on data shaped %s using dimensions\n"%(str(projected_X.shape)))
# If inverse image is not provided, we use the projection as the inverse image (suffer projection loss)
if inverse_X is None:
inverse_X = projected_X
# We chop up the min-max column ranges into 'nr_cubes' parts
self.chunk_dist = (np.max(projected_X, axis=0) - np.min(projected_X, axis=0))/nr_cubes
# We calculate the overlapping windows distance
self.overlap_dist = self.overlap_perc * self.chunk_dist
# We find our starting point
self.d = np.min(projected_X, axis=0)
# Use a dimension index array on the projected X
# (For now this uses the entire dimensionality, but we keep for experimentation)
di = np.array([x for x in range(projected_X.shape[1])])
# Prefix'ing the data with ID's
ids = np.array([x for x in range(projected_X.shape[0])])
projected_X = np.c_[ids,projected_X]
inverse_X = np.c_[ids,inverse_X]
# Subdivide the projected data X in intervals/hypercubes with overlap
if self.verbose > 0:
total_cubes = len(cube_coordinates_all(nr_cubes,projected_X.shape[1]))
print("Creating %s hypercubes."%total_cubes)
for i, coor in enumerate(cube_coordinates_all(nr_cubes,di.shape[0])):
# Slice the hypercube
hypercube = projected_X[ np.invert(np.any((projected_X[:,di+1] >= self.d[di] + (coor * self.chunk_dist[di])) &
(projected_X[:,di+1] < self.d[di] + (coor * self.chunk_dist[di]) + self.chunk_dist[di] + self.overlap_dist[di]) == False, axis=1 )) ]
if self.verbose > 1:
print("There are %s points in cube_%s / %s with starting range %s"%
(hypercube.shape[0],i,total_cubes,self.d[di] + (coor * self.chunk_dist[di])))
# If at least one sample inside the hypercube
if hypercube.shape[0] > 0:
# Cluster the data point(s) in the cube, skipping the id-column
# Note that we apply clustering on the inverse image (original data samples) that fall inside the cube.
inverse_x = inverse_X[[int(nn) for nn in hypercube[:,0]]]
clusterer.fit(inverse_x[:,1:])
if self.verbose > 1:
print("Found %s clusters in cube_%s\n"%(np.unique(clusterer.labels_[clusterer.labels_ > -1]).shape[0],i))
#Now for every (sample id in cube, predicted cluster label)
for a in np.c_[hypercube[:,0],clusterer.labels_]:
if a[1] != -1: #if not predicted as noise
cluster_id = str(coor[0])+"_"+str(i)+"_"+str(a[1])+"_"+str(coor)+"_"+str(self.d[di] + (coor * self.chunk_dist[di])) # TODO: de-rudimentary-ify
nodes[cluster_id].append( int(a[0]) ) # Append the member id's as integers
else:
if self.verbose > 1:
print("Cube_%s is empty.\n"%(i))
# Create links when clusters from different hypercubes have members with the same sample id.
candidates = itertools.combinations(nodes.keys(),2)
for candidate in candidates:
# if there are non-unique members in the union
if len(nodes[candidate[0]]+nodes[candidate[1]]) != len(set(nodes[candidate[0]]+nodes[candidate[1]])):
links[candidate[0]].append( candidate[1] )
# Reporting
if self.verbose > 0:
nr_links = 0
for k in links:
nr_links += len(links[k])
print("\ncreated %s edges and %s nodes in %s."%(nr_links,len(nodes),str(datetime.now()-start)))
complex["nodes"] = nodes
complex["links"] = links
complex["meta"] = self.projection
return complex
def visualize(self, complex, color_function="", path_html="mapper_visualization_output.html", title="My Data",
graph_link_distance=30, graph_gravity=0.1, graph_charge=-120, custom_tooltips=None, width_html=0,
height_html=0, show_tooltips=True, show_title=True, show_meta=True):
# Turns the dictionary 'complex' in a html file with d3.js
#
# Input: complex. Dictionary (output from calling .map())
# Output: a HTML page saved as a file in 'path_html'.
#
# parameters
# ----------
# color_function string. Not fully implemented. Default: "" (distance to origin)
# path_html file path as string. Where to save the HTML page.
# title string. HTML page document title and first heading.
# graph_link_distance int. Edge length.
# graph_gravity float. "Gravity" to center of layout.
# graph_charge int. charge between nodes.
# custom_tooltips None or Numpy Array. You could use "y"-label array for this.
# width_html int. Width of canvas. Default: 0 (full width)
# height_html int. Height of canvas. Default: 0 (full height)
# show_tooltips bool. default:True
# show_title bool. default:True
# show_meta bool. default:True
# Format JSON for D3 graph
json_s = {}
json_s["nodes"] = []
json_s["links"] = []
k2e = {} # a key to incremental int dict, used for id's when linking
for e, k in enumerate(complex["nodes"]):
# Tooltip and node color formatting, TODO: de-mess-ify
if custom_tooltips is not None:
tooltip_s = "<h2>Cluster %s</h2>"%k + " ".join([str(f) for f in custom_tooltips[complex["nodes"][k]]])
if color_function == "average_signal_cluster":
tooltip_i = int(((sum([f for f in custom_tooltips[complex["nodes"][k]]]) / len(custom_tooltips[complex["nodes"][k]])) * 30) )
json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(tooltip_i)})
else:
json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(k.split("_")[0])})
else:
tooltip_s = "<h2>Cluster %s</h2>Contains %s members."%(k,len(complex["nodes"][k]))
json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(k.split("_")[0])})
k2e[k] = e
for k in complex["links"]:
for link in complex["links"][k]:
json_s["links"].append({"source": k2e[k], "target":k2e[link],"value":1})
# Width and height of graph in HTML output
if width_html == 0:
width_css = "100%"
width_js = 'document.getElementById("holder").offsetWidth-20'
else:
width_css = "%spx" % width_html
width_js = "%s" % width_html
if height_html == 0:
height_css = "100%"
height_js = 'document.getElementById("holder").offsetHeight-20'
else:
height_css = "%spx" % height_html
height_js = "%s" % height_html
# Whether to show certain UI elements or not
if show_tooltips == False:
tooltips_display = "display: none;"
else:
tooltips_display = ""
if show_meta == False:
meta_display = "display: none;"
else:
meta_display = ""
if show_title == False:
title_display = "display: none;"
else:
title_display = ""
with open(path_html,"wb") as outfile:
html = """<!DOCTYPE html>
<meta charset="utf-8">
<meta name="generator" content="KeplerMapper">
<title>%s | KeplerMapper</title>
<link href='https://fonts.googleapis.com/css?family=Roboto:700,300' rel='stylesheet' type='text/css'>
<style>
* {margin: 0; padding: 0;}
html { height: 100%%;}
body {background: #111; height: 100%%; font: 100 16px Roboto, Sans-serif;}
.link { stroke: #999; stroke-opacity: .333; }
.divs div { border-radius: 50%%; background: red; position: absolute; }
.divs { position: absolute; top: 0; left: 0; }
#holder { position: relative; width: %s; height: %s; background: #111; display: block;}
h1 { %s padding: 20px; color: #fafafa; text-shadow: 0px 1px #000,0px -1px #000; position: absolute; font: 300 30px Roboto, Sans-serif;}
h2 { text-shadow: 0px 1px #000,0px -1px #000; font: 700 16px Roboto, Sans-serif;}
.meta { position: absolute; opacity: 0.9; width: 220px; top: 80px; left: 20px; display: block; %s background: #000; line-height: 25px; color: #fafafa; border: 20px solid #000; font: 100 16px Roboto, Sans-serif;}
div.tooltip { position: absolute; width: 380px; display: block; %s padding: 20px; background: #000; border: 0px; border-radius: 3px; pointer-events: none; z-index: 999; color: #FAFAFA;}
}
</style>
<body>
<div id="holder">
<h1>%s</h1>
<p class="meta">
<b>Lens</b><br>%s<br><br>
<b>Cubes per dimension</b><br>%s<br><br>
<b>Overlap percentage</b><br>%s%%<br><br>
<b>Color Function</b><br>%s( %s )<br><br>
<b>Clusterer</b><br>%s<br><br>
<b>Scaler</b><br>%s
</p>
</div>
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js"></script>
<script>
var width = %s,
height = %s;
var color = d3.scale.ordinal()
.domain(["0","1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29","30"])
.range(["#FF0000","#FF1400","#FF2800","#FF3c00","#FF5000","#FF6400","#FF7800","#FF8c00","#FFa000","#FFb400","#FFc800","#FFdc00","#FFf000","#fdff00","#b0ff00","#65ff00","#17ff00","#00ff36","#00ff83","#00ffd0","#00e4ff","#00c4ff","#00a4ff","#00a4ff","#0084ff","#0064ff","#0044ff","#0022ff","#0002ff","#0100ff","#0300ff","#0500ff"]);
var force = d3.layout.force()
.charge(%s)
.linkDistance(%s)
.gravity(%s)
.size([width, height]);
var svg = d3.select("#holder").append("svg")
.attr("width", width)
.attr("height", height);
var div = d3.select("#holder").append("div")
.attr("class", "tooltip")
.style("opacity", 0.0);
var divs = d3.select('#holder').append('div')
.attr('class', 'divs')
.attr('style', function(d) { return 'overflow: hidden; width: ' + width + 'px; height: ' + height + 'px;'; });
graph = %s;
force
.nodes(graph.nodes)
.links(graph.links)
.start();
var link = svg.selectAll(".link")
.data(graph.links)
.enter().append("line")
.attr("class", "link")
.style("stroke-width", function(d) { return Math.sqrt(d.value); });
var node = divs.selectAll('div')
.data(graph.nodes)
.enter().append('div')
.on("mouseover", function(d) {
div.transition()
.duration(200)
.style("opacity", .9);
div .html(d.tooltip + "<br/>")
.style("left", (d3.event.pageX + 100) + "px")
.style("top", (d3.event.pageY - 28) + "px");
})
.on("mouseout", function(d) {
div.transition()
.duration(500)
.style("opacity", 0);
})
.call(force.drag);
node.append("title")
.text(function(d) { return d.name; });
force.on("tick", function() {
link.attr("x1", function(d) { return d.source.x; })
.attr("y1", function(d) { return d.source.y; })
.attr("x2", function(d) { return d.target.x; })
.attr("y2", function(d) { return d.target.y; });
node.attr("cx", function(d) { return d.x; })
.attr("cy", function(d) { return d.y; })
.attr('style', function(d) { return 'width: ' + (d.group * 2) + 'px; height: ' + (d.group * 2) + 'px; ' + 'left: '+(d.x-(d.group))+'px; ' + 'top: '+(d.y-(d.group))+'px; background: '+color(d.color)+'; box-shadow: 0px 0px 3px #111; box-shadow: 0px 0px 33px '+color(d.color)+', inset 0px 0px 5px rgba(0, 0, 0, 0.2);'})
;
});
</script>"""%(title,width_css, height_css, title_display, meta_display, tooltips_display, title,complex["meta"],self.nr_cubes,self.overlap_perc*100,color_function,complex["meta"],str(self.clusterer),str(self.scaler),width_js,height_js,graph_charge,graph_link_distance,graph_gravity,json.dumps(json_s))
outfile.write(html.encode("utf-8"))
if self.verbose > 0:
print("\nWrote d3.js graph to '%s'"%path_html)