-
Notifications
You must be signed in to change notification settings - Fork 0
/
modules 3 week 6 hw1 clustering.txt
182 lines (144 loc) · 6.94 KB
/
modules 3 week 6 hw1 clustering.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
"""
Cluster class for Module 3
"""
import math
import alg_cluster
#####################################################
# Code for closest pairs of clusters
def pair_distance(cluster_list, idx1, idx2):
"""
Helper function that computes Euclidean distance between two clusters in a list
Input: cluster_list is list of clusters, idx1 and idx2 are integer indices for two clusters
Output: tuple (dist, idx1, idx2) where dist is distance between
cluster_list[idx1] and cluster_list[idx2]
"""
return (cluster_list[idx1].distance(cluster_list[idx2]), min(idx1, idx2), max(idx1, idx2))
def slow_closest_pair(cluster_list):
"""
Compute the distance between the closest pair of clusters in a list (slow)
Input: cluster_list is the list of clusters
Output: tuple of the form (dist, idx1, idx2) where the centers of the clusters
cluster_list[idx1] and cluster_list[idx2] have minimum distance dist.
"""
idx_1 = -1
idx_2 = -1
distance = tuple([float("inf"), idx_1, idx_2])
for dummy_1 in range(len(cluster_list)):
for dummy_2 in range(dummy_1 + 1,len(cluster_list)):
if pair_distance(cluster_list,dummy_1,dummy_2) < distance:
distance = pair_distance(cluster_list,dummy_1,dummy_2)
return distance
def fast_closest_pair(cluster_list):
"""
Compute the distance between the closest pair of clusters in a list (fast)
Input: cluster_list is list of clusters SORTED such that horizontal positions of their
centers are in ascending order
Output: tuple of the form (dist, idx1, idx2) where the centers of the clusters
cluster_list[idx1] and cluster_list[idx2] have minimum distance dist.
"""
length = len(cluster_list)
if length <= 3:
distance = slow_closest_pair(cluster_list)
else:
mid = length/2
cluster1 = cluster_list[0:mid]
cluster2 = cluster_list[mid:length]
distance1 = fast_closest_pair(cluster1)
pdistance2 = fast_closest_pair(cluster2)
distance2 = tuple([list(pdistance2)[0], list(pdistance2)[1] + mid, list(pdistance2)[2] +mid])
distance = min(distance1, distance2)
dstrip = list(distance)[0]
middle = 0.5 * (cluster_list[mid-1].horiz_center() +cluster_list[mid].horiz_center())
distance = min(distance, closest_pair_strip(cluster_list, middle, dstrip))
return distance
def closest_pair_strip(cluster_list, horiz_center, half_width):
"""
Helper function to compute the closest pair of clusters in a vertical strip
Input: cluster_list is a list of clusters produced by fast_closest_pair
horiz_center is the horizontal position of the strip's vertical center line
half_width is the half the width of the strip (i.e; the maximum horizontal distance
that a cluster can lie from the center line)
Output: tuple of the form (dist, idx1, idx2) where the centers of the clusters
cluster_list[idx1] and cluster_list[idx2] lie in the strip and have minimum distance dist.
"""
# get a set s
set1 = list()
for dummy_1 in range(len(cluster_list)):
if horiz_center - half_width <= cluster_list[dummy_1].horiz_center() <= horiz_center + half_width:
set1.append(dummy_1)
# sort the S in nondecreasing order of the vertical (y) coordinates
set1.sort(key = lambda cluster: cluster_list[cluster].vert_center())
klength = len(set1)
idx1 = -1
idx2 = -1
distance = tuple([float("inf"), idx1, idx2])
for dummy_2 in range(klength-1):
for dummy_3 in range( dummy_2 + 1 , min(dummy_2 + 4, klength)) :
distance = min(distance, pair_distance(cluster_list, set1[dummy_2], set1[dummy_3]))
return distance
######################################################################
# Code for hierarchical clustering
def hierarchical_clustering(cluster_list, num_clusters):
"""
Compute a hierarchical clustering of a set of clusters
Note: the function may mutate cluster_list
Input: List of clusters, integer number of clusters
Output: List of clusters whose length is num_clusters
"""
new_cluster_list = cluster_list[:]
new_cluster_list.sort(key = lambda cluster: cluster.horiz_center())
while len(new_cluster_list) > num_clusters:
distance = fast_closest_pair(new_cluster_list)
idx1 = list(distance)[1]
idx2 = list(distance)[2]
new_cluster_list[idx1].merge_clusters(new_cluster_list[idx2])
new_cluster_list.pop(idx2)
return new_cluster_list
######################################################################
# Code for k-means clustering
def center_cluster_distance(center, cluster):
"""
Compute distance of center to the point
"""
vert_dist = list(center)[1] - cluster.vert_center()
horiz_dist = list(center)[0] - cluster.horiz_center()
return math.sqrt(vert_dist ** 2 + horiz_dist ** 2)
def nearest_center(center_list, cluster):
"""
Compute nearest center
"""
# initial index and distance
idx1 = -1
distance = float("inf")
for dummy in range(len(center_list)):
if center_cluster_distance(center_list[dummy], cluster) < distance:
distance = center_cluster_distance(center_list[dummy], cluster)
idx1 = dummy
return idx1
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
"""
Compute the k-means clustering of a set of clusters
Note: the function may not mutate cluster_list
Input: List of clusters, integers number of clusters and number of iterations
Output: List of clusters whose length is num_clusters
"""
# position initial centers at the location of clusters with largest populations
new_cluster_list = cluster_list[:]
new_cluster_list.sort(key = lambda cluster: cluster.total_population())
center_list = list()
for dummy in range(num_clusters):
center_list.append(tuple([new_cluster_list[-dummy-1].horiz_center(),new_cluster_list[-dummy-1].vert_center()]))
# start the q iteration of k-means method
for dummy_q in range(num_iterations):
#initial k empty clusters
clustering_results = list()
for dummy_k in range(num_clusters):
clustering_results.append(alg_cluster.Cluster(set([]),0,0,0,0))
#add each of n points to the cluster with the cloest center
for cluster in new_cluster_list:
idx = nearest_center(center_list, cluster)
clustering_results[idx].merge_clusters(cluster)
#compute k new centers
for dummy_k in range(num_clusters):
center_list[dummy_k] = tuple(list([clustering_results[dummy_k].horiz_center(),clustering_results[dummy_k].vert_center()]))
return clustering_results