# train-svm.py
# %%
import numpy as np
from sklearn.svm import SVC
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import joblib
import os
# Make SVM folder if it doesn't exist
if not os.path.exists('SVM'):
    os.makedirs('SVM')
    print("Created SVM folder")
# Possible spectrogram types are "stft", "mel", "mfcc", and "pncc"
spectrogram_type = "stft"
classes_get = ['JUMP','BOAT','SEAGUL_SCREAM','BRIDGE','SCOOTER','PEE','OBJECT_SPLASH','UFO','IDLE_MOTOR','SEAGUL_SPLASH','VOICE', 'SWIM']
FULL_TRAIN_DATASET_PATH = 'data/full_data/'+spectrogram_type+'/train_data.npy'
FULL_TRAIN_LABEL_PATH = 'data/full_data/'+spectrogram_type+'/train_labels.npy'
FULL_VAL_DATASET_PATH = 'data/full_data/'+spectrogram_type+'/val_data.npy'
FULL_VAL_LABEL_PATH = 'data/full_data/'+spectrogram_type+'/val_labels.npy'
data_np_one_hot = np.load(FULL_TRAIN_DATASET_PATH)
data_label_object = np.load(FULL_TRAIN_LABEL_PATH, allow_pickle=True)
data_labels = []
data_np = []
classes_get_np = np.asarray(classes_get)
# the LDA expects a single label for each sample, so we need to convert the one-hot encoded labels to single labels
# If we meet a sample with multiple labels, we make multiple copies of the sample, one for each label
for data, labels in zip(data_np_one_hot, data_label_object):
    class_indexes = np.where(labels.label == 1)[0]
    get_classes = np.take(classes_get_np, class_indexes)
    # print(get_classes)
    for class_index in class_indexes:
        data_labels.append(class_index)
        data_np.append(data)
data_np = np.asarray(data_np)
train_labels = np.asarray(data_labels)
#val_np = np.load(FULL_VAL_DATASET_PATH)
#val_labels = np.load(FULL_VAL_LABEL_PATH, allow_pickle=True)
val_data_np_one_hot = np.load(FULL_VAL_DATASET_PATH)
val_data_label_object = np.load(FULL_VAL_LABEL_PATH, allow_pickle=True)
val_data_labels = []
val_data_np = []
classes_get_np = np.asarray(classes_get)
for data, labels in zip(val_data_np_one_hot, val_data_label_object):
    class_indexes = np.where(labels.label == 1)[0]
    get_classes = np.take(classes_get_np, class_indexes)
    # print(get_classes)
    for class_index in class_indexes:
        val_data_labels.append(class_index)
        val_data_np.append(data)
val_np = np.asarray(val_data_np)
val_labels = np.asarray(val_data_labels)
val_classes = np.unique(val_labels)
train_spectrograms = np.reshape(data_np, (len(data_np), -1))
val_spectrograms = np.reshape(val_np, (len(val_np), -1))
print('Spectrogram shape:', train_spectrograms.shape)
print('Labels shape:', train_labels.shape)
#print('Spectrogram dtype:', spectrograms.dtype)
#print(labels[3])
print('Training samples:', train_spectrograms.shape[0])
print('Validation samples:', val_spectrograms.shape[0])
#print('Test samples:', test_spectrograms.shape[0])
# Print the distribution of labels across the two datasets
unique_train_labels, train_counts = np.unique(train_labels, return_counts=True)
unique_val_labels, val_counts = np.unique(val_labels, return_counts=True)
print('Training distribution:')
for label, count in zip(unique_train_labels, train_counts):
print(f' - {label}: {count}')
print('Validation distribution:')
for label, count in zip(unique_val_labels, val_counts):
print(f' - {label}: {count}')
print('Spectrogram Shape:', train_spectrograms.shape)
print("Shape data_set_train:", train_spectrograms.shape)
print("Shape data_set_val:", val_spectrograms.shape)
# %%
dmr = True
components = 2
# Implement LDA to reduce dimensionality
if dmr:
    # lda = LDA(n_components=components)
    # lda.fit(train_spectrograms, train_labels)
    lda = joblib.load("LDA/lda_model_"+spectrogram_type+".pkl")
    # joblib.dump(fitted_LDA, 'LDA/lda_model.pkl')
    train_data = lda.transform(train_spectrograms)
    val_data = lda.transform(val_spectrograms)
    print("Shape data_set_train:", train_data.shape)
    print("Shape data_set_val:", val_data.shape)
    # Reshape to 2d array of the 3rd and 4th components
    # train_data = train_data[:, 2:4]
    # val_data = val_data[:, 2:4]
print("Shape data_set_train:", train_data.shape)
print("Shape data_set_val:", val_data.shape)
# %%
### Parameter Sweep 1 ###
kernels = ['Polynomial', 'RBF', 'Sigmoid','Linear']
#A function which returns the corresponding SVC model
def getClassifier(ktype):
    if ktype == 0:
        # Polynomial kernel
        return SVC(kernel='poly', degree=8, gamma="auto")
    elif ktype == 1:
        # Radial Basis Function kernel
        return SVC(kernel='rbf', gamma="auto")
    elif ktype == 2:
        # Sigmoid kernel
        return SVC(kernel='sigmoid', gamma="auto")
    elif ktype == 3:
        # Linear kernel
        return SVC(kernel='linear', gamma="auto")
for i in range(4):
    if os.path.isfile('SVM/model_'+spectrogram_type+'_'+kernels[i]+'.pkl'):
        svclassifier = joblib.load('SVM/model_'+spectrogram_type+'_'+kernels[i]+'.pkl')
    else:
        svclassifier = getClassifier(i)
        svclassifier.fit(train_data, train_labels)
        joblib.dump(svclassifier, 'SVM/model_'+spectrogram_type+'_'+kernels[i]+'.pkl')

    # Make predictions and evaluate the model
    x_pred = svclassifier.predict(train_data)
    y_pred = svclassifier.predict(val_data)
    print("Evaluation:", kernels[i], "kernel")
    print(classification_report(val_labels, y_pred))

    # Plot confusion matrix of training data using metrics.ConfusionMatrixDisplay, with labels
    cm = metrics.ConfusionMatrixDisplay(metrics.confusion_matrix(train_labels, x_pred), display_labels=classes_get)
    fig, ax = plt.subplots(figsize=(10, 10))
    plt.subplots_adjust(left=0.2)
    cm.plot(ax=ax, xticks_rotation='vertical')
    cm.ax_.set_title('Confusion matrix of the classifier: \n '+kernels[i]+' kernel on training data')
    plt.savefig('SVM/confusion_matrix_'+spectrogram_type+'_'+kernels[i]+'_train.png')
    # plt.show()

    # Plot confusion matrix of validation data using metrics.ConfusionMatrixDisplay, with labels
    cm = metrics.ConfusionMatrixDisplay(metrics.confusion_matrix(val_labels, y_pred), display_labels=classes_get)
    fig, ax = plt.subplots(figsize=(10, 10))
    plt.subplots_adjust(left=0.2)
    cm.plot(ax=ax, xticks_rotation='vertical')
    cm.ax_.set_title('Confusion matrix of the classifier: \n '+kernels[i]+' kernel on validation data')
    plt.savefig('SVM/confusion_matrix_'+spectrogram_type+'_'+kernels[i]+'_val.png')
    # plt.show()
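# %%
# Optional summary (a sketch): condense the per-kernel reports above into a single
# validation accuracy per kernel, reloading the models saved by the loop above.
for name in kernels:
    clf = joblib.load('SVM/model_'+spectrogram_type+'_'+name+'.pkl')
    acc = metrics.accuracy_score(val_labels, clf.predict(val_data))
    print(f'{name} kernel: validation accuracy = {acc:.3f}')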
# %%
### Parameter Sweep 2 ###
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
if os.path.isfile('SVM/grid_model_'+spectrogram_type+'.pkl'):
    grid = joblib.load('SVM/grid_model_'+spectrogram_type+'.pkl')
else:
    grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
    grid.fit(train_data, train_labels)
    # Save grid model
    joblib.dump(grid, 'SVM/grid_model_'+spectrogram_type+'.pkl')
print(grid.best_estimator_)
# Save best estimator to txt file
with open('SVM/best_estimator_'+spectrogram_type+'.txt', 'w') as f:
    f.write(str(grid.best_estimator_))
# Make prediction
grid_predictions = grid.predict(val_data)
print(confusion_matrix(val_labels,grid_predictions))
cm = metrics.ConfusionMatrixDisplay(metrics.confusion_matrix(val_labels, grid_predictions), display_labels=classes_get)
fig, ax = plt.subplots(figsize=(10,10))
plt.subplots_adjust(left=0.2)
cm.plot(ax=ax, xticks_rotation='vertical')
cm.ax_.set_title('Confusion matrix of the classifier with best estimator : \n '+str(grid.best_estimator_)+ ' on validation data')
plt.savefig('SVM/confusion_matrix_'+spectrogram_type+'_best_estimator.png')
#plt.show()
print(classification_report(val_labels, grid_predictions))
# Save classification report to txt file
with open('SVM/classification_report_'+spectrogram_type+'_best_estimator.txt', 'w') as f:
    f.write(classification_report(val_labels, grid_predictions))
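# %%
# Inference sketch (an assumption, not part of the original pipeline): to classify a
# new clip, its spectrogram must go through the same flatten + LDA transform as the
# training data before being passed to one of the saved SVMs. `new_spectrogram` is a
# hypothetical array with the same shape as a single training spectrogram.
def predict_clip(new_spectrogram, kernel_name='RBF'):
    clf = joblib.load('SVM/model_'+spectrogram_type+'_'+kernel_name+'.pkl')
    features = new_spectrogram.reshape(1, -1)
    if dmr:
        features = lda.transform(features)
    # Map the predicted class index back to its class name
    return classes_get[int(clf.predict(features)[0])]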