-
Notifications
You must be signed in to change notification settings - Fork 0
/
k_means.py
53 lines (52 loc) · 1.74 KB
/
k_means.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Based on a problem set from Stanford CS221
#http://stanford.edu/~cpiech/cs221/handouts/kmeans.html
def K_means(dataSet, k, *, maxIterations=11):
    """Cluster the rows of *dataSet* into *k* groups with Lloyd's k-means.

    Centroids are seeded uniformly at random inside the per-feature bounding
    box of the data, then samples are repeatedly assigned to their nearest
    centroid (Euclidean distance) and each centroid is moved to the mean of
    its samples.  Iteration stops when the centroids no longer change or
    after ``maxIterations`` update steps, whichever comes first.

    Args:
        dataSet: Two-dimensional, numeric, array-like data (for example a
            pandas ``DataFrame`` or a NumPy array) laid out as
            samples x features.  Columns are used positionally, so column
            and index labels are irrelevant.
        k: Number of clusters.  A value <= 0 yields an empty list.
        maxIterations: Upper bound on the number of assign/update steps.

    Returns:
        A list of ``k`` centroids, each a list of ``numFeatures`` floats.
        If the final update step leaves a cluster without samples, that
        centroid is returned as a list of NaN values.

    Raises:
        ValueError: If *dataSet* is not two-dimensional, or if it holds no
            samples while ``k`` is positive.
    """
    # Imported locally because this file has no module-level import block.
    import random

    import numpy as np

    points = np.asarray(dataSet, dtype=float)
    if points.ndim != 2:
        raise ValueError("dataSet must be two-dimensional (samples x features)")
    if k <= 0:
        return []
    if points.shape[0] == 0:
        raise ValueError("dataSet must contain at least one sample")

    numFeatures = points.shape[1]
    lows = points.min(axis=0).tolist()
    highs = points.max(axis=0).tolist()

    def getRandomCentroids():
        # One random point per cluster inside the data's bounding box.
        return [
            [random.uniform(lo, hi) for lo, hi in zip(lows, highs)]
            for _ in range(k)
        ]

    def getLabels(centroids):
        # Squared distances, shape (samples, k); the square root is skipped
        # because it does not change which centroid is nearest.
        centers = np.asarray(centroids, dtype=float)
        diffs = points[:, None, :] - centers[None, :, :]
        sqDistances = (diffs ** 2).sum(axis=2)
        # argmin keeps the first minimum, so ties go to the lowest index.
        return sqDistances.argmin(axis=1)

    def getCentroids(labels):
        newCentroids = []
        for j in range(k):
            members = points[labels == j]
            if len(members):
                newCentroids.append(members.mean(axis=0).tolist())
            else:
                # An empty cluster has no mean; mark it so the main loop
                # can re-seed on the next pass.
                newCentroids.append([float("nan")] * numFeatures)
        return newCentroids

    centroids = getRandomCentroids()
    oldCentroids = None
    iterations = 0
    while iterations < maxIterations and oldCentroids != centroids:
        # A NaN centroid means some cluster ended up empty: start over from
        # fresh random seeds rather than carrying the dead centroid along.
        if np.isnan(np.asarray(centroids, dtype=float)).any():
            centroids = getRandomCentroids()
        oldCentroids = centroids
        iterations += 1
        labels = getLabels(centroids)
        centroids = getCentroids(labels)
    return centroids