forked from noameshed/novelty-detection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_relation_classifier.py
314 lines (268 loc) · 11.2 KB
/
train_relation_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
# This code trains a classifier to distinguish classes that are in, not in, have relatives
# in, or have a parent in imagenet. The feature vectors are the frequencies of the top n
# labels of a particular class
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from tqdm import tqdm
def get_feature_vec_top_labels(data, query, n):
    '''
    Creates and returns a feature vector of length n
    The feature vector is based on the distribution of top labels for a class

    data:  nested mapping indexed as data[group][class]; each entry provides
           'counts' and 'confs' sequences
    query: row-like object providing 'Biological Group' and 'Class'
    n:     length of the returned feature vector

    The counts are normalised to frequencies and multiplied element-wise by
    the confidences. If there are not n values, zeros are prepended to the
    front of the vector to make it length n; if there are more than n, only
    the last n are kept.

    Returns a list of n floats, or None if the class has no counts.
    '''
    grp = query['Biological Group']
    name = query['Class']
    counts = data[grp][name]['counts']
    confs = data[grp][name]['confs']
    if counts is None:
        return None

    # Cast to float before dividing: an in-place `/=` on an integer array
    # raises because the true-division result cannot be cast back to int.
    freqs = np.asarray(counts, dtype=float)
    total = freqs.sum()
    if total:
        # Convert counts to frequencies; all-zero counts are left as zeros
        # rather than turned into NaN by a 0/0 division.
        freqs = freqs / total

    # Multiply frequencies by confidence levels
    feat_vec = freqs * confs

    if len(feat_vec) < n:
        feat_vec = np.concatenate((np.zeros(n - len(feat_vec)), feat_vec))
    assert len(feat_vec[-n:]) == n
    return list(feat_vec[-n:])
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    y_true:    ground-truth labels
    y_pred:    predicted labels
    classes:   accepted for backward compatibility but not used; the tick
               labels are derived from the labels present in the data
    normalize: if True, every row is divided by its row sum
    title:     plot title; a default depending on `normalize` is used if falsy
    cmap:      colormap handed to imshow

    Displays the figure with plt.show() and returns the axes it was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Use every label that occurs in either vector so the tick labels always
    # line up with the rows/columns of the matrix, including labels that
    # appear only in the predictions.
    labels = unique_labels(y_true, y_pred)

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A label that only occurs in y_pred has an all-zero row; divide by 1
        # there so the row stays 0 instead of becoming NaN.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=labels, yticklabels=labels,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    plt.show()
    return ax
def split_data(X, Y, XClasses):
    '''
    Splits the data into training and testing data and labels by class, so
    that all rows sharing a class index end up on the same side of the split.

    X: Sequence of feature rows, one per sample
    Y: The label for each row of X
    XClasses: The class index for each row of X

    The unique class indices are split 60/40 (test_size=0.4, random_state=42)
    and every row is routed to the side its class index landed on.

    Returns:
    X_train: list of feature rows for training
    X_test: list of feature rows for testing
    Y_train: labels of training data
    Y_test: labels of testing data
    '''
    print(np.array(X).shape, np.array(Y).shape, np.array(XClasses).shape)

    # Convert once so the element-wise `==` below is well defined even when
    # the caller passes a plain Python list.
    row_class_ids = np.asarray(XClasses)
    class_ids = np.unique(row_class_ids)  # Split by class instead of by row

    # Split the data
    train_ids, test_ids = train_test_split(class_ids, test_size=0.4, random_state=42)

    def _rows_for(ids):
        # Collect features/labels class by class, in the order the class ids
        # came out of the split.
        feats, labels = [], []
        for class_id in ids:
            for i in np.where(row_class_ids == class_id)[0]:
                feats.append(X[i])
                labels.append(Y[i])
        return feats, labels

    X_train, Y_train = _rows_for(train_ids)
    X_test, Y_test = _rows_for(test_ids)
    return X_train, X_test, Y_train, Y_test
def combine_to_superclasses(X, X_classes, imagenet_labels):
    '''
    Collapse per-class confidence rows into per-superclass averages.

    The class -> superclass table is read from 'imagenet_superclasses.csv'
    (columns 'Imagenet Class' and 'Superclass'); rows whose superclass value
    is a float (i.e. a blank cell read as NaN) are assigned to 'manmade'.

    X: 2-D array of confidences, one row per sample
    X_classes: for each row of X, the class name of every column
    imagenet_labels: sequence whose length must equal the number of classes
                     in the CSV (checked with an assert)

    Returns an array with one row per row of X and one column per superclass
    (superclasses in sorted order); each cell is the average of the row's
    confidences whose class maps to that superclass.
    '''
    table = pd.read_csv('imagenet_superclasses.csv')

    to_super = {}   # class -> superclass (1-1)
    members = {}    # superclass -> list of classes (1-many)
    for _, record in table.iterrows():
        fine_label = record['Imagenet Class']
        coarse_label = record['Superclass']
        coarse_label = 'manmade' if isinstance(coarse_label, float) else coarse_label
        to_super[fine_label] = coarse_label
        members.setdefault(coarse_label, []).append(fine_label)

    assert len(to_super) == len(imagenet_labels)

    # New label scheme replacing the per-class labels
    superclass_names = np.unique(list(members))
    print(len(superclass_names), 'unique superclass labels')

    # Shorten confidence vectors by averaging values by superclass
    n_rows, _ = X.shape
    averaged = np.zeros((n_rows, len(superclass_names)))
    for r, confidences in enumerate(X):
        coarse_per_column = np.array([to_super[fine] for fine in X_classes[r]])
        for c, coarse in enumerate(superclass_names):
            columns = np.where(coarse_per_column == coarse)[0]
            averaged[r, c] = np.average(confidences[columns])
    return averaged
if __name__ == '__main__':
    # Script entry point: build one feature vector per annotated class of the
    # selected biological group, split by class, train an SVM and a random
    # forest on the 'Relation to Imagenet' label and plot both confusion
    # matrices.

    # Load dataframe of inaturalist annotations
    df = pd.read_csv('in_out_class.csv')
    species = 'Aves'
    save_path = None  #'C:/Users/noam_/Documents/Cornell/CS7999/11_25_19/resnet18_80_class_imagenet/' # Set to None to prevent saving

    # Keep only the annotated rows of the selected biological group
    labeled_data = df[df['Biological Group'] == species]
    labeled_data = labeled_data[labeled_data['Annotator'].notnull()]

    # Collect feature vectors and labels
    n = 20
    X = []
    X_classes = []      # only filled by the disabled confidence-vector path below
    Y_imagenet_relation = []
    Y_table_index = []
    img_names = []      # only filled by the disabled confidence-vector path below
    l_sorted = []       # only filled by the disabled confidence-vector path below

    ### Uses distribution of top n labels as feature vectors
    with open('alexnet_inat_results/inat_results_top_choice.json', 'r') as results_file:
        top_label_results = json.load(results_file)

    for i in labeled_data.index:
        # labeled_data.index holds index labels, so look the row up by label
        # (.loc) rather than by position (.iloc).
        row = df.loc[i]
        vec = get_feature_vec_top_labels(top_label_results, row, n)
        if vec is None:
            continue
        X.append(list(vec))
        Y_imagenet_relation.append(row['Relation to Imagenet'])
        Y_table_index.append(i)

    # Disabled alternative feature extraction, kept for reference as an
    # inert string literal.
    """
    ### Uses confidence vectors as feature vectors - looks at separate files, not large json
    for i in labeled_data.index:
        row = df.iloc[i]
        grp = row['Biological Group']
        name = row['Class']
        filename = os.getcwd() + '/alexnet_inat_results_bbox/' + grp + '/' + name + '.json'
        with open(filename, 'r') as f:
            f = json.load(f)
        for im in tqdm(f.keys()): # Loop through the images in the file
            # Query the vector of labels and confidence levels for each image test
            try:
                l = f[im]['labels']
                c = f[im]['confs']
            except:
                print('No label for ', filename, im)
            # Sort according to the label name (alphabetical)
            to_sort = np.argsort(l)
            l_sorted = [l[i] for i in to_sort]
            c_sorted = [c[i] for i in to_sort]
            # Add to training data
            X.append(list(c_sorted))
            X_classes.append(list(l_sorted))
            # X.append([max(c_sorted)]) # Only train on top confidence value
            Y_imagenet_relation.append(row['Relation to Imagenet'])
            Y_table_index.append(i)
            img_names.append(im)
    """

    X = np.array(X)
    Y_imagenet_relation = np.array(Y_imagenet_relation)
    # print(X.shape, Y_imagenet_relation.shape, np.array(X_classes).shape)

    # Combine vectors (length 1000) into superclasses vectors (length 80)
    # X = combine_to_superclasses(X, X_classes, l_sorted)

    # Split data
    print('DATA SIZES', len(X), len(Y_imagenet_relation))
    X_train, X_test, Y_train, Y_test = split_data(X, Y_imagenet_relation, Y_table_index)
    # X_train, X_test, Y_train, Y_test, train_imgs, test_imgs = split_data(X, Y_imagenet_relation, Y_table_index)

    # Subsample the training data so we are training on equal numbers for each category
    Y_train_arr = np.array(Y_train)
    relation_labels = ['in imagenet', 'not in imagenet',
                       'parent in imagenet', 'relative in imagenet']
    idxs_by_label = {}
    for label in relation_labels:
        members = np.flatnonzero(Y_train_arr == label)
        if len(members):
            idxs_by_label[label] = members
    if not idxs_by_label:
        raise ValueError('No training samples with a known relation label')

    # Size every category to the smallest one present. Sampling
    # len('in imagenet') items without replacement from a smaller category
    # would raise, so the smallest category sets the per-class count.
    samples = min(len(members) for members in idxs_by_label.values())
    print('TRAINING IMS IN IMAGENET', samples)

    # Combine all indices for training subset
    chosen = []
    for members in idxs_by_label.values():
        if len(members) > samples:
            members = np.random.choice(members, samples, replace=False)
        chosen.append(members)
    idxs = np.sort(np.concatenate(chosen))
    X_train_subset = [X_train[i] for i in idxs]
    Y_train_subset = [Y_train[i] for i in idxs]

    # Train SVM classifier on the balanced subset (kernel is left at SVC's
    # default). random_state is given as an explicit integer seed instead of
    # the boolean True (True == 1 in Python).
    clf_svc = SVC(tol=1e-3, random_state=1, class_weight='balanced')
    clf_svc.fit(X_train_subset, Y_train_subset)
    preds_svc = clf_svc.predict(X_test)
    print('SVM:', clf_svc.score(X_test, Y_test))

    # Save SVM results:
    # if save_path is not None:
    #     with open(save_path+species+'_svm_results.txt', 'w') as f:
    #         f.write('ImageID\t Actual\t Prediction\n')
    #         for i, p in enumerate(preds_svc):
    #             line = test_imgs[i]+'\t'+Y_test[i]+'\t'+p+'\n'
    #             f.write(line)

    # Train Random Forest Classifier. Unlike the SVM, this is fitted on the
    # full training split (not the balanced subset) with balanced class weights.
    clf_rf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
    clf_rf.fit(X_train, Y_train)
    preds_rf = clf_rf.predict(X_test)
    print('Random Forest:', clf_rf.score(X_test, Y_test))

    # Save RF results:
    # if save_path is not None:
    #     with open(save_path+species+'_rf_results_.txt', 'w') as f:
    #         f.write('ImageID\t Actual\t Prediction\n')
    #         for i, p in enumerate(preds_rf):
    #             line = test_imgs[i]+'\t'+Y_test[i]+'\t'+p+'\n'
    #             f.write(line)

    # Plot confusion matrices (titles follow the selected species)
    classes = ['relative in imagenet', 'in imagenet', 'parent in imagenet', 'not in imagenet']
    title = 'ImageNet Relationship Classifier (SVM) - ' + species
    plot_confusion_matrix(Y_test, preds_svc, classes, normalize=True, title=title)
    title = 'ImageNet Relationship Classifier (RF) - ' + species
    plot_confusion_matrix(Y_test, preds_rf, classes, normalize=True, title=title)