-
Notifications
You must be signed in to change notification settings - Fork 2
/
clusters.py
168 lines (140 loc) · 5.57 KB
/
clusters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
'''
module: clusters.py
use: contains functions associated clustering / unsupervised learning
'''
import numpy as np
from kmeans import kplusplus
def symmetrize(array):
'''
Purpose:
Returns the symmetric version of an upper or lower triangular array
Inputs:
array - upper OR lower triangular ndarray
Outputs:
symmetric version of array
'''
return array + array.T - np.diag(array.diagonal())
def getSimilarityArray(feature_array,similarity_method = 'exp',k_nn = 5):
'''
Purpose:
Computes the similarity array for a given feature set, similarity method, and k_nearest_neighbors value
Part of the spectral clustering process
Inputs:
feature_array - set of features
similarity_method - method to use for computing the similarity array:
--'exp' computes W[i,j] = exp(-||xi - xj||^2 / 2)
--'norm' computes W[i,j] = ||xi - xj||^2
--'chain' is specifically for the 'chain' generateData type
k_nn - number of nearest neighbors to consider (k_nn=5 means only the top 5 largest similarity values are kept nonzero)
Outputs:
sim_array - symmetric array of similarity strength values
'''
allowed_methods = ['exp','norm','chain']
if similarity_method not in allowed_methods:
print('ERROR: Not a valid similarity_method')
return
else:
sim_array = np.zeros((len(feature_array),len(feature_array)))
i = 0
j = 0
for rowi in feature_array:
for rowj in feature_array:
if i <= j:
difference = (rowi-rowj).T
if similarity_method == 'exp':
sim_array[i,j] = np.exp(-1*((difference.T).dot(difference)))
elif similarity_method == 'norm':
sim_array[i,j] = difference.T.dot(difference)
elif similarity_method == 'chain':
if np.linalg.norm(difference) <= 1.5:
if ((i != int(len(feature_array)/2.)-1) and (j != int(len(feature_array)/2.))):
sim_array[i,j] = 1
if i == j:
sim_array[i,j] = 1
j += 1
i += 1
j = 0
sim_array = sim_array - np.diag(sim_array.diagonal()) #remove diagonal nonzero values
if k_nn != -1:
for rowi in sim_array:
ind = np.argpartition(rowi, -1*k_nn)[(-1*k_nn):]
for i in range(len(rowi)):
if i not in ind:
rowi[i] = 0;
return symmetrize(sim_array)
def getDegreeArray(sim_array): #convert array W into respective Degree array, Dii = sum(i=1 to n) Wij
'''
Purpose:
Computes the Degree array 'D' in the spectral clustering process from the similarity array
Dii = \sum_{i=1}^n Wij, ie the sum of each row of the similarity array
Inputs:
sim_array - Similarity array Wij retrieved from getSimilarityArray()
Outputs:
D - degree array (described in Purpose
'''
D = np.zeros((sim_array.shape[0],sim_array.shape[0]))
for i in range(0,sim_array.shape[0]):
D[i,i] = np.sum(sim_array[i,:])
return D
def getLaplacian(W,D):
'''
Purpose:
Returns the Laplacian of the similarity array W and the degree array D
For use with spectral clustering
Inputs:
W - similarity array from getSimilarityArray()
D - degree array from getDegreeArray
Outputs:
L = D-W, the laplacian
'''
return D-W
def getLaplacianBasis(features,similarity_method='exp',k_nn=5, get_W=True):
'''
Purpose:
Returns orthogonal basis for Laplacian embedding of features. Essentially the full spectral clustering algorithm before the actual clustering
Inputs:
features - n examples by k features ndarray (n>k preferred)
similarity_method - method to use for computing the similarity array:
--'exp' computes W[i,j] = exp(-||xi - xj||^2 / 2)
--'norm' computes W[i,j] = ||xi - xj||^2
--'chain' is specifically for the 'chain' generateData type
k_nn - number of nearest neighbors to consider in similarity array
num_clusters - number of clusters for kmeans++ to sort the data into
get_W - if get_W is False, then features is actually the similarity matrix W, and thus a new W will not be computed according to similarity_method
Outputs:
U - orthogonal basis returned by the svd of the laplacian with columns corresponding to the most significant singular values at the lowest indices
'''
if get_W:
W = getSimilarityArray(features,similarity_method,k_nn)
else:
W = features
D = getDegreeArray(W)
L = getLaplacian(W,D)
U,s,V = np.linalg.svd(L,full_matrices=0)
return U
def spectralClustering(features,similarity_method='exp',k_nn=5,basis_dim=2,num_clusters=2,get_W=True):
'''
Purpose:
Performs spectral clustering into 'num_clusters' clusters on data defined in the ndarray 'features'
Inputs:
features - n examples by k features ndarray (n>k preferred)
similarity_method - method to use for computing the similarity array:
--'exp' computes W[i,j] = exp(-||xi - xj||^2 / 2)
--'norm' computes W[i,j] = ||xi - xj||^2
--'chain' is specifically for the 'chain' generateData type
k_nn - number of nearest neighbors to consider in similarity array
basis_dim - number of svd basis vectors to consider for input to kmeans++ algorithm
num_clusters - number of clusters for kmeans++ to sort the data into
get_W - if get_W is False, then features is actually the similarity matrix W, and thus a new W will not be computed according to similarity_method
Outputs:
labels - 1 by n array of assigned cluster labels for each feature example
centers - cluster centers array (basis_dim by num_clusters) representing each of the k cluster centers
'''
#W = getSimilarityArray(features,similarity_method,k_nn)
#D = getDegreeArray(W)
#L = getLaplacian(W,D)
#U,s,V = np.linalg.svd(L,full_matrices=0)
U = getLaplacianBasis(features=features,similarity_method=similarity_method,k_nn=k_nn,get_W=get_W)
U = U[:,-1*basis_dim:]
labels, centers = kplusplus(U.T,num_clusters)
return labels, centers, U