forked from letiantian/kmedoids
-
Notifications
You must be signed in to change notification settings - Fork 1
/
kmedoids.py
97 lines (92 loc) · 3.33 KB
/
kmedoids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import numpy as np
import random
def kMedoids(D, k, tmax=100, init_Ms="random", n_iso=None):
# determine dimensions of distance matrix D
m, n = D.shape
# Assert square distance matrix
assert n == m
if k > n:
raise Exception('too many medoids')
# randomly initialize an array of k medoid indices
if str(init_Ms) == "random":
M = np.arange(n)
np.random.shuffle(M)
M = np.sort(M[:k])
elif str(init_Ms) == "isolated":
if n_iso is None:
n_iso = k
elif n_iso > k:
n_iso = k
else:
assert type(n_iso) == int
if n_iso == 0:
raise Exception('You must choose a non-zero number of \
isolated medoids or switch to random init.')
summed_distances = np.sum(D, axis = 0)
av_distance = np.sum(D) / float(n)**2
M = np.empty(0, dtype=int)
# Find most isolated sample to use as first medoid
i = np.argmax(summed_distances)
M = np.append(M, i)
# Find new medoids which are furthest away from previous medoids
for i in range(1, n_iso):
summed_distances = np.zeros(n)
for j in range(0, len(M)):
summed_distances += D[:, M[j]]
# avoid repeating medoids
summed_distances[M] = 0
j = np.argmax(summed_distances)
M = np.append(M, j)
# Randomize the rest
for i in range(n_iso, k):
rand = np.random.random_integers(0,n-1)
# Make sure that the random medoid is not already chosen and
# is not too close to another medoid by imposing the inter-
# medoid distance to be larger than the average inter-sample
# distance
tries = 0
while rand in M or any(D[rand,j] < av_distance for j in M):
tries += 1
rand = np.random.random_integers(0,n-1)
if tries == 100:
tries = 0
av_distance *= 0.9
M = np.append(M, rand)
elif len(init_Ms) == k:
M = init_Ms
elif len(init_Ms) < k:
M = init_Ms
for i in range(len(init_Ms), k):
rand = np.random.random_integers(0,n-1)
while rand in M:
rand = np.random.random_integers(0,n-1)
M = np.append(M, rand)
else:
raise Exception("intial medoids vector has the wrong length")
# create a copy of the array of medoid indices
Mnew = np.copy(M)
# initialize a dictionary to represent clusters
C = {}
for t in range(tmax):
# determine clusters, i. e. arrays of data indices
J = np.argmin(D[:,M], axis=1)
J[M] = np.arange(0, k, 1)
for kappa in range(k):
C[kappa] = np.where(J==kappa)[0]
# update cluster medoids
for kappa in range(k):
J = np.mean(D[np.ix_(C[kappa],C[kappa])],axis=1)
j = np.argmin(J)
Mnew[kappa] = C[kappa][j]
np.sort(Mnew)
# check for convergence
if np.array_equal(M, Mnew):
break
M = np.copy(Mnew)
else:
# final update of cluster memberships
J = np.argmin(D[:,M], axis=1)
for kappa in range(k):
C[kappa] = np.where(J==kappa)[0]
# return results
return M, C