-
Notifications
You must be signed in to change notification settings - Fork 5
/
dissimilarity_common.py
116 lines (98 loc) · 4.48 KB
/
dissimilarity_common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# -*- coding: utf-8 -*-
"""This module implements the computation of the dissimilarity
representation of a set of objects from a set of prototypes given a
distance function. Various prototype selection policies are available.
Copyright (c) 2012-2014, Emanuele Olivetti and Eleftherios Garyfallidis
Distributed under the BSD 3-clause license. See COPYING.txt.
"""
from __future__ import division
import numpy as np
from sys import stdout
try:
from joblib import cpu_count, Parallel, delayed
joblib_available = True
except ImportError:
joblib_available = False
def furthest_first_traversal(S, k, distance, permutation=True):
"""This is the farthest first traversal (fft) algorithm which is
known to be a good sub-optimal solution to the k-center problem.
See for example:
Hochbaum, Dorit S. and Shmoys, David B., A Best Possible Heuristic
for the k-Center Problem, Mathematics of Operations Research, 1985.
or: http://en.wikipedia.org/wiki/Metric_k-center
"""
# do an initial permutation of S, just to be sure that objects in
# S have no special order. Note that this permutation does not
# affect the original S.
if permutation:
idx = np.random.permutation(S.shape[0])
S = S[idx]
else:
idx = np.arange(S.shape[0], dtype=np.int)
T = [0]
while len(T) < k:
z = distance(S, S[T]).min(1).argmax()
T.append(z)
return idx[T]
def subset_furthest_first(S, k, distance, permutation=True, c=2.0):
"""Stochastic scalable version of the fft algorithm based in a
random subset of a specific size.
See: E. Olivetti, T.B. Nguyen, E. Garyfallidis, The Approximation
of the Dissimilarity Projection, Proceedings of the 2012
International Workshop on Pattern Recognition in NeuroImaging
(PRNI), vol., no., pp.85,88, 2-4 July 2012 doi:
10.1109/PRNI.2012.13
D. Turnbull and C. Elkan, Fast Recognition of Musical Genres
Using RBF Networks, IEEE Trans Knowl Data Eng, vol. 2005, no. 4,
pp. 580-584, 17.
"""
size = int(max(1, np.ceil(c * k * np.log(k))))
if permutation:
idx = np.random.permutation(S.shape[0])[:size]
else:
idx = range(size)
# note: no need to add extra permutation here below:
return idx[furthest_first_traversal(S[idx], k, distance, permutation=False)]
def compute_dissimilarity(data, distance, prototype_policy, num_prototypes, verbose=False, size_limit=5000000, n_jobs=None):
"""Compute dissimilarity matrix given data, distance,
prototype_policy and number of prototypes.
"""
print "Computing dissimilarity data for the original data:",
data_original = data
num_proto = num_prototypes
if data.shape[0] > size_limit:
print
print "Datset too big: subsampling to %s entries only!" % size_limit
data = data[np.random.permutation(data.shape[0])[:size_limit], :]
print prototype_policy
print "number of prototypes:", num_proto
stdout.flush()
if verbose: print("Generating %s prototypes as" % num_proto),
# Note that we use the original dataset here, not the subsampled one!
if prototype_policy=='random':
if verbose: print("random subset of the initial data.")
prototype_idx = np.random.permutation(data_original.shape[0])[:num_proto]
prototype = [data_original[i] for i in prototype_idx]
elif prototype_policy=='fft':
prototype_idx = furthest_first_traversal(data_original, num_proto, distance)
prototype = [data_original[i] for i in prototype_idx]
elif prototype_policy=='sff':
prototype_idx = subset_furthest_first(data_original, num_proto, distance)
prototype = [data_original[i] for i in prototype_idx]
else:
raise Exception
if verbose: print("Computing dissimilarity matrix.")
if joblib_available and n_jobs != 1:
if n_jobs is None or n_jobs == -1:
n_jobs = cpu_count()
print("Parallel computation of the dissimilarity matrix: %s cpus." % n_jobs)
if n_jobs > 1:
tmp = np.linspace(0, data.shape[0], n_jobs).astype(np.int)
else: # corner case: joblib detected 1 cpu only.
tmp = (0, data.shape[0])
chunks = zip(tmp[:-1], tmp[1:])
data_dissimilarity = np.vstack(Parallel(n_jobs=n_jobs)(delayed(distance)(data[start:stop], prototype) for start, stop in chunks))
else:
data_dissimilarity = distance(data, prototype)
print
return data_dissimilarity