# faiss_kmeans.py
import time

import numpy as np
import faiss

from source import eval_cluster
import preprocess


def run_kmeans(x, nmb_clusters, verbose=False):
"""Runs kmeans on 1 GPU.
Args:
x: data
nmb_clusters (int): number of clusters
Returns:
list: ids of data in each cluster
"""
n_data, d = x.shape
    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from an epoch to another.
    clus.seed = np.random.randint(1234)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
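
    # build a flat (brute-force) L2 index on GPU 0 for centroid training and assignment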
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
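
    # assign each data point to its nearest centroid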
    _, I = index.search(x, 1)
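
    # objective value recorded by faiss at each k-means iteration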
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]


class Kmeans:
    def __init__(self, k):
        self.k = k

    def cluster(self, feat, verbose=False):
        """Performs k-means clustering.

        Args:
            feat (np.array N * dim): data to cluster

        Returns:
            float: final k-means loss
        """
        end = time.time()

        # PCA-reducing, whitening and L2-normalization
        xb = preprocess.preprocess_features(feat)

        # cluster the data
        I, loss = run_kmeans(xb, self.k, verbose)
        self.labels = I
        if verbose:
            print('k-means time: {0:.0f} s'.format(time.time() - end))

        return loss


if __name__ == "__main__":
    num_classes = 1000
    deepcluster = Kmeans(num_classes)
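
    # load pre-extracted features stored as a flat float32 binary (one row per image)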
print("loading features ...")
features = np.fromfile('data/unlabeled/imgnet_rotation/features/res50_rotation.bin',
dtype=np.float32).reshape(1281167, -1)
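
    # read the ground-truth class labels, one integer per line in meta.txt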
    with open('data/unlabeled/imgnet_rotation/meta.txt', 'r') as f:
        lines = f.readlines()
    labels = np.array([int(l.strip()) for l in lines])
print("clustering ...")
clustering_loss = deepcluster.cluster(features, verbose=True)
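
    # evaluate the clustering: cluster-size statistics and F-score against the ground-truth labels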
    pred = np.array(deepcluster.labels)
    hist = np.bincount(pred, minlength=num_classes)
    minimal_cls_size, maximal_cls_size = hist.min(), hist.max()
    prec, rec, fscore = eval_cluster.fscore(labels, pred)
    print("prec: {:.5g}, rec: {:.5g}, fscore: {:.5g}".format(prec, rec, fscore))