-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathModels.py
500 lines (426 loc) · 22.4 KB
/
Models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
import warnings
warnings.filterwarnings("ignore")
from sklearn import tree
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree.export import export_text
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import os
from skrules import SkopeRules
from scipy import stats
import matplotlib.pyplot as plt
import time
from sklearn.utils import shuffle
# Wall-clock reference; the total running time is reported at the end of the script
start = time.time()

# Locations of the pre-encoded training and test files
train_dath_path = 'census-income.data/train_data'
test_data_path = 'census-income.test/test_data'

# Display names for the feature columns, passed to the tree export and to Skope Rules
feature_names = [
    "Age",
    # Class of worker
    "ClassNotEmployed", "ClassPrivate", "ClassSelfEmployed",
    "ClassLocalGovernment", "ClassStateGovernment", "ClassFederalGovernment",
    "IndustryCode", "OccupationCode",
    # Education level
    "LessThanHighSchool:", "College", "Bachelors", "Masters", "ProfDegree", "Doctorate",
    "Wage",
    # Current enrolment
    "EnrolledEducation - Not in School", "EnrolledHighSchool", "EnrolledCollegeOrUniversity",
    # Marital status
    "IsNotMarried", "IsMarried", "IsDivorced", "IsWidowed",
    # Race and sex
    "RaceAsian", "RaceWhite", "RaceOther", "RaceAmericanIndian", "RaceBlack", "SexMale",
    # Employment status
    "EmploymentStatusNotEmployed", "EmploymentStatusPartTime", "EmploymentStatusFullTime",
    # Investment income
    "CapitalGains", "CapitalLosses", "StockDividends",
    # Tax filing status
    "HeadOfHousehold", "JointFiler", "SingleFiler", "NonFiler",
    "HasAmericanParent", "IsAmericanBorn", "WeeksWorked",
]
# Read the data of a file into an array
def read_file_into_array(file_path):
    """Parse a text file of ', '-separated numbers into a list of float rows.

    Every line longer than one character becomes one row. Shorter lines,
    such as the empty string left after the final newline, are skipped.

    :param file_path: path of the file to read
    :return: list of rows, each row being a list of floats
    :raises ValueError: if a field cannot be converted to float
    """
    file_array = []
    # The context manager closes the file even when a value fails to parse
    with open(file_path) as file:
        for line in file.read().split("\n"):
            if len(line) > 1:
                # Convert the string values to floats for processing
                file_array.append([float(value) for value in line.split(', ')])
    return file_array
# Read the pre-encoded training and test files; each becomes a list of float rows
train_data = read_file_into_array(train_dath_path)
test_data = read_file_into_array(test_data_path)
# Remove a feature at the specified index
def remove_feature(data_array, index):
    """Return a copy of data_array in which column `index` is dropped from every row.

    The number of columns is taken from the first row. The input is not modified.
    """
    if not data_array:
        return []
    kept_columns = [col for col in range(len(data_array[0])) if col != index]
    return [[row[col] for col in kept_columns] for row in data_array]
# Drop column 39 from both the training and the test set.
# NOTE(review): this column is described as the instance weights; that cannot
# be confirmed from this file alone - verify against the data description.
train_data = remove_feature(train_data, 39)
test_data = remove_feature(test_data, 39)
# Separate the class from the features
def separate_features_from_class(data_array):
    """Split every row into its feature values and its class label.

    The last column (as counted on the first row) is the class; all
    preceding columns are features. Labels are returned as one-element
    lists, one per row.

    :return: (feature_array, class_array)
    """
    if not data_array:
        return [], []
    last = len(data_array[0]) - 1
    feature_array = []
    class_array = []
    for row in data_array:
        feature_array.append([row[col] for col in range(last)])
        class_array.append([row[last]] if last >= 0 else [])
    return feature_array, class_array
# Print the number of majority and minority classes in a data set
def print_class_distribution(data, data_type):
    """Count and print how many labels are 0.0 (majority) versus anything else (minority).

    :param data: sequence of class labels
    :param data_type: name of the data set, used only in the printed text
    :return: (num_majority, num_minority)
    """
    num_majority = sum(1 for label in data if label == 0.0)
    num_minority = len(data) - num_majority
    print("The number of majority class instances in the " + data_type + " set is " + str(num_majority))
    print("The number of minority class instances in the " + data_type + " set is " + str(num_minority))
    print("The total is " + str(num_minority + num_majority))
    return num_majority, num_minority
# Get the training data and convert it to the appropriate format
X_train, Y_train = separate_features_from_class(train_data)
X_train = np.array(X_train)
# The labels come back as one-element rows; flatten them into a 1-D vector
Y_train = np.array(Y_train).ravel()
# Get the test data and convert it to the appropriate format
X_test, Y_test = separate_features_from_class(test_data)
X_test = np.array(X_test)
Y_test = np.array(Y_test).ravel()
# Print the class distributions of the training and testing set before resampling
print_class_distribution(Y_train, "training")
print_class_distribution(Y_test, "testing")
# Oversample the minority class (label 1) to 20000 instances.
# `sampling_strategy` is the current name of the keyword formerly called
# `ratio`, which imbalanced-learn deprecated and later removed.
# NOTE(review): 140529 is presumably the number of class-0 rows in the raw
# training file - confirm whenever the input data changes.
ros = RandomOverSampler(random_state=0, sampling_strategy={1: 20000, 0: 140529})
X_train, Y_train = ros.fit_resample(X_train, Y_train)
# Undersample the majority class (label 0) down to 50000 instances
ros = RandomUnderSampler(random_state=0, sampling_strategy={1: 20000, 0: 50000})
X_train, Y_train = ros.fit_resample(X_train, Y_train)
X_train, Y_train = shuffle(X_train, Y_train)
# Split the resampled training data 80/20 for model selection:
# X_val / Y_val (first 80%) are used to fit the candidate models and
# X_test_val / Y_test_val (last 20%) are used to score them
split_at = int(0.8 * len(X_train))
X_val = X_train[0:split_at]
Y_val = Y_train[0:split_at]
X_test_val = X_train[split_at:]
Y_test_val = Y_train[split_at:]
# Print the class distributions of the training set and the held-out 20% after resampling
print("After resampling")
print_class_distribution(Y_train, "training")
print_class_distribution(Y_test_val, "validation")
# Helper function that prints statistics for predictions from the models
# Prints accuracy, recall, f1 score, precision, and the confusion matrix
def print_statistics(model_name, y_pred, y_true):
    """Print the evaluation metrics of one model and return its accuracy.

    :param model_name: label used in the printed lines
    :param y_pred: predicted class labels
    :param y_true: reference class labels
    :return: the accuracy score
    """
    # All metrics are computed before anything is printed
    accuracy = accuracy_score(y_true, y_pred)
    minority_metrics = [
        ("The minority class recall for the model ", recall_score(y_true, y_pred)),
        ("The minority class f1 score for the model ", f1_score(y_true, y_pred)),
        ("The minority class precision for the model ", precision_score(y_true, y_pred)),
    ]
    print("\nThe accuracy for the model " + model_name + " is {0:.2%}".format(accuracy))
    for prefix, value in minority_metrics:
        print(prefix + model_name + " is {0:.2%}".format(value))
    print("The confusion matrix is given by: ")
    print(confusion_matrix(y_true, y_pred))
    return accuracy
# Compute and print the results from cross validation
def print_cross_validation(model, model_name, data, target, cv=10):
    """Cross-validate a model and print accuracy, recall, precision and ROC area per fold.

    :param model: estimator handed to cross_validate
    :param model_name: label used in the printed header
    :param data: feature matrix
    :param target: class labels
    :param cv: cross-validation setting forwarded to cross_validate (default 10 folds)
    :return: the score dictionary produced by cross_validate
    """
    scoring = {'accuracy': 'accuracy',
               'recall': 'recall',
               'precision': 'precision',
               'roc_auc': 'roc_auc'}
    scores = cross_validate(model, data, target, cv=cv, scoring=scoring)
    print("\nPrinting the results per fold for the " + model_name + " model")
    # Follow the number of folds actually returned instead of assuming ten
    for i in range(len(scores['test_accuracy'])):
        fold = "Fold " + str(i + 1)
        print("\n" + fold + ": Accuracy " + " {0:.2%}".format(scores['test_accuracy'][i]))
        print(fold + ": Recall " + " {0:.2%}".format(scores['test_recall'][i]))
        print(fold + ": Precision " + " {0:.2%}".format(scores['test_precision'][i]))
        print(fold + ": ROC area " + " {0:.2%}".format(scores['test_roc_auc'][i]))
    return scores
# Helper function to write the decision tree model to a file
def write_rules_to_file(data_array, file_path):
    """Write str(data_array) to file_path, replacing any previous content.

    :param data_array: object whose string form is written
    :param file_path: destination path
    """
    # Mode 'w' truncates an existing file, so no prior delete is needed,
    # and the context manager closes the handle on exit
    with open(file_path, 'w') as f:
        f.write(str(data_array))
# **************** Decision Tree Section ******************************
# Fit three candidate decision trees on the 80% split and score them on the
# held-out 20% split; one configuration is then retrained on all training data
# First decision tree model: gini criterion, depth capped at 8
clf_val_gini = tree.DecisionTreeClassifier(max_depth=8, criterion="gini", min_samples_split=2)
clf_val_gini.fit(X_val, Y_val)
clf_val_gini_predict = clf_val_gini.predict(X_test_val)
print_statistics("Decision Tree - Gini criterion", clf_val_gini_predict, Y_test_val)
# Second decision tree model: entropy criterion, depth capped at 20
clf_val_entropy = tree.DecisionTreeClassifier(max_depth=20, criterion="entropy", min_samples_split=2)
clf_val_entropy.fit(X_val, Y_val)
clf_val_entropy_predict = clf_val_entropy.predict(X_test_val)
print_statistics("Decision Tree - Entropy criterion", clf_val_entropy_predict, Y_test_val)
# Third decision tree model: gini criterion with the library's default settings
clf_val_gini2 = tree.DecisionTreeClassifier(criterion="gini")
clf_val_gini2.fit(X_val, Y_val)
clf_val_gini2_predict = clf_val_gini2.predict(X_test_val)
print_statistics("Decision Tree - Gini criterion variant", clf_val_gini2_predict, Y_test_val)
# The selected configuration is the first (gini) model; the messages below
# name it accordingly instead of calling it the entropy tree
print("In light of these results, the gini decision tree will be chosen with max depth = 8, and minimum samples to split = 2")
# Next we fit the chosen decision tree classifier with the training data
dt_clf = tree.DecisionTreeClassifier(max_depth=8, criterion="gini", min_samples_split=2)
# Compute the cross validated accuracy scores
dt_clf_cross_validation_scores = print_cross_validation(dt_clf, "Decision Tree - Gini criterion", X_train, Y_train)
# Train the decision tree and predict the results on the test set, and print the results
dt_clf.fit(X_train, Y_train)
dt_clf_pred = dt_clf.predict(X_test)
print_statistics("Decision Tree - Gini criterion", dt_clf_pred, Y_test)
# Next we export the tree's rules as text and write them to a file
tree_rules = export_text(dt_clf, feature_names=feature_names)
write_rules_to_file(tree_rules, "Tree Rules")
# **************** Nearest Neighbours Section *********************
# Compare five nearest-neighbour configurations: each is fitted on the 80%
# split (X_val) and scored on the held-out 20% split (X_test_val)
# First knn model: 5 neighbours, distance-weighted votes
knn_val_1 = KNeighborsClassifier(n_neighbors=5, weights="distance")
knn_val_1.fit(X_val, Y_val)
knn_val_1_predict = knn_val_1.predict(X_test_val)
print_statistics("Nearest Neighbours - Model 1 ", knn_val_1_predict, Y_test_val)
# Second knn model: 5 neighbours, manhattan metric
knn_val_2 = KNeighborsClassifier(n_neighbors=5, metric="manhattan")
knn_val_2.fit(X_val, Y_val)
knn_val_2_predict = knn_val_2.predict(X_test_val)
print_statistics("Nearest Neighbours - Model 2 ", knn_val_2_predict, Y_test_val)
# Third knn model: 5 neighbours, distance-weighted votes, manhattan metric
knn_val_3 = KNeighborsClassifier(n_neighbors=5, weights="distance", metric="manhattan")
knn_val_3.fit(X_val, Y_val)
knn_val_3_predict = knn_val_3.predict(X_test_val)
print_statistics("Nearest Neighbours - Model 3 ", knn_val_3_predict, Y_test_val)
# Fourth knn model: 3 neighbours, distance-weighted votes
knn_val_4 = KNeighborsClassifier(n_neighbors=3, weights="distance")
knn_val_4.fit(X_val, Y_val)
knn_val_4_predict = knn_val_4.predict(X_test_val)
print_statistics("Nearest Neighbours - Model 4 ", knn_val_4_predict, Y_test_val)
# Fifth knn model: single nearest neighbour, manhattan metric
knn_val_5 = KNeighborsClassifier(n_neighbors=1, metric="manhattan")
knn_val_5.fit(X_val, Y_val)
knn_val_5_predict = knn_val_5.predict(X_test_val)
print_statistics("Nearest Neighbours - Model 5 ", knn_val_5_predict, Y_test_val)
print("\nAs per the results, the first nearest neighbours model with parameters 5 neighbours at a weighted distance, will be chosen")
# Re-create the chosen configuration (same parameters as model 1) for the full training data
knn = KNeighborsClassifier(n_neighbors=5, weights="distance")
# Compute cross validation on the nearest neighbour model
knn_cross_validation_scores = print_cross_validation(knn, "Nearest Neighbours", X_train, Y_train)
# Fit on the full training data, then predict the test set and print the stats
knn.fit(X_train, Y_train)
knn_pred = knn.predict(X_test)
print_statistics("Nearest Neighbours", knn_pred, Y_test)
# ************* Neural Network with Semi-Supervised Learning *******
# Small fully connected binary classifier: four ReLU hidden layers followed
# by a single sigmoid output unit
neural_network = Sequential()
# Size the input layer from the training matrix instead of hard-coding the
# number of feature columns (42 for the current data)
neural_network.add(Dense(8, input_dim=X_train.shape[1], activation='relu'))
neural_network.add(Dense(16, activation='relu'))
neural_network.add(Dense(8, activation='relu'))
neural_network.add(Dense(4, activation='relu'))
neural_network.add(Dense(1, activation='sigmoid'))
neural_network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Get the indices which have a high probability for training the network
# only on high probability unsupervised labellings
def get_indices_of_high_probability(pred_array):
    """Return the positions of the confident predictions in pred_array.

    A prediction counts as confident when it is above 0.60 or below 0.175;
    values between the two thresholds (inclusive) are left out.

    :param pred_array: sequence of predicted probabilities
    :return: list of integer indices, in ascending order
    """
    return [i for i, probability in enumerate(pred_array)
            if probability > 0.60 or probability < 0.175]
# Helper function to split a list in two
def split_list(a_list):
    """Return (front, back) halves of a_list; the back half gets the extra item when the length is odd."""
    midpoint = len(a_list) // 2
    front = a_list[:midpoint]
    back = a_list[midpoint:]
    return front, back
# Compute Semi Supervised learning on the network
def compute_semi_supervised_learning(neural_net, training_features, training_classes):
    """Train neural_net in stages, mixing true labels with its own confident predictions.

    The data is cut into 4 consecutive partitions. The first partition is
    used with its true labels. Each later partition is halved: the first
    half is trained with true labels, the second half is labelled by the
    network itself and only the confidently predicted rows are trained on.

    :param neural_net: compiled model exposing fit() and predict()
    :param training_features: feature matrix, sliceable by row
    :param training_classes: class labels aligned with training_features
    :return: the same neural_net object after training
    """
    num_partitions = 4
    increment_val = int(len(training_features) / num_partitions)
    feature_array = []
    class_array = []
    # Split the data into the appropriate number of partitions
    for i in range(num_partitions):
        begin = i * increment_val
        # The last partition also takes whatever the integer division left over
        end = begin + increment_val if i != num_partitions - 1 else len(training_features)
        feature_array.append(np.array(training_features[begin:end]))
        class_array.append(np.array(training_classes[begin:end]))
    # Train with the first partition (supervised)
    neural_net.fit(feature_array[0], class_array[0], epochs=10, batch_size=1000)
    for i in range(1, num_partitions):
        # Split the partition into a supervised half and an unsupervised half;
        # the true labels of the unsupervised half are deliberately not used
        feature_array_sl, feature_array_ul = split_list(feature_array[i])
        class_array_sl, _ = split_list(class_array[i])
        # Train the network on the supervised sub-array
        neural_net.fit(feature_array_sl, class_array_sl, epochs=10, batch_size=1000)
        # Get the predicted probabilities for the unclassified data
        pred_probabilities = np.array(neural_net.predict(feature_array_ul))
        pred_indices = get_indices_of_high_probability(pred_probabilities)
        if not pred_indices:
            # Nothing was predicted confidently, so there is nothing to fit on
            continue
        # Threshold the sigmoid output at 0.5 to obtain class labels. This matches
        # what predict_classes returned for a single-output model, without a second
        # forward pass and without relying on a method newer Keras no longer has
        pred_labels = (pred_probabilities > 0.5).astype("int32")
        # Train the network only on unsupervised data that is predicted as 0 or 1 with high probability
        neural_net.fit(feature_array_ul[pred_indices], pred_labels[pred_indices], epochs=10, batch_size=1000)
    return neural_net
# Train the network, then make predictions on the test set and print the results
neural_network = compute_semi_supervised_learning(neural_network, X_train, Y_train)
# Threshold the sigmoid output at 0.5 to get class labels. This matches what
# predict_classes returned for a single-output model (same column shape) and
# does not rely on a method that newer Keras releases no longer provide
neural_network_pred = (np.array(neural_network.predict(np.array(X_test))) > 0.5).astype("int32")
print_statistics("Semi-Supervised Neural Network", neural_network_pred, Y_test)
# ************* Rule Model: ************************
# Here we compare 3 Skope Rules models: each is fitted on the 80% split
# (X_val) and scored on the held-out 20% split (X_test_val)
# First skope rules model
rule_clf1 = SkopeRules(
    n_estimators=50,
    precision_min=0.2,
    recall_min=0.2,
    feature_names=feature_names)
rule_clf1.fit(X_val, Y_val)
rule_clf1_ypred = rule_clf1.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 1", rule_clf1_ypred, Y_test_val)
# Second skope rules model
# NOTE(review): these parameters are identical to model 1, so this is not a
# distinct configuration - confirm which setting was meant to differ
rule_clf2 = SkopeRules(n_estimators=50,
                       precision_min=0.2,
                       recall_min=0.2,
                       feature_names=feature_names)
rule_clf2.fit(X_val, Y_val)
rule_clf2_ypred = rule_clf2.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 2", rule_clf2_ypred, Y_test_val)
# Third skope rules model: half as many estimators
rule_clf3 = SkopeRules(n_estimators=25,
                       precision_min=0.2,
                       recall_min=0.2,
                       feature_names=feature_names)
rule_clf3.fit(X_val, Y_val)
rule_clf3_ypred = rule_clf3.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 3", rule_clf3_ypred, Y_test_val)
print("\nAs per the results, Skope Rules model 1 will be chosen for training")
# Re-create the model 1 configuration for use on the full training data
rule_clf = SkopeRules(n_estimators=50,
                      precision_min=0.2,
                      recall_min=0.2,
                      feature_names=feature_names)
# Run 10-fold cross validation on the Skope Rules model
rule_clf_cross_validation_scores = print_cross_validation(rule_clf, "Skope Rules", X_train, Y_train)
# Train the skope rules model on the training data and print the results on the test data
rule_clf.fit(X_train, Y_train)
rule_clf_pred = rule_clf.predict(X_test)
print_statistics("Skope Rules", rule_clf_pred, Y_test)
# Print every rule kept by the fitted Skope Rules model, one per line
rules = rule_clf.rules_
for rule in rules:
    print(rule)
# ************* Ensemble Model: AdaBoostClassifier **********************
# Here we compare three AdaBoostClassifiers: each is fitted on the 80% split
# (X_val) and scored on the held-out 20% split (X_test_val)
# First ada boost model: 50 estimators, default learning rate
ada_boost_clf1 = AdaBoostClassifier(n_estimators=50, random_state=0)
ada_boost_clf1.fit(X_val, Y_val)
ada_boost_clf1_ypred = ada_boost_clf1.predict(X_test_val)
print_statistics("Ada Boost Classifier - Model 1", ada_boost_clf1_ypred, Y_test_val)
# Second ada boost model: 50 estimators, learning rate 0.5
ada_boost_clf2 = AdaBoostClassifier(n_estimators=50, random_state=0, learning_rate=0.5)
ada_boost_clf2.fit(X_val, Y_val)
ada_boost_clf2_ypred = ada_boost_clf2.predict(X_test_val)
print_statistics("Ada Boost Classifier - Model 2", ada_boost_clf2_ypred, Y_test_val)
# Third ada boost model: 100 estimators, learning rate 0.2
ada_boost_clf3 = AdaBoostClassifier(n_estimators=100, random_state=0, learning_rate=0.2)
ada_boost_clf3.fit(X_val, Y_val)
ada_boost_clf3_ypred = ada_boost_clf3.predict(X_test_val)
print_statistics("Ada Boost Classifier - Model 3", ada_boost_clf3_ypred, Y_test_val)
# The configuration retrained below is model 3's (100 estimators, learning
# rate 0.2), so the message names model 3.
# NOTE(review): the message used to say "Model 1"; if model 1 was the intended
# choice, change the parameters below instead.
print("\nAs per the results, Model 3 will be chosen for training and testing")
ada_boost_clf = AdaBoostClassifier(n_estimators=100, random_state=0, learning_rate=0.2)
# Compute 10-fold cross validation on the Ada Boost Classifier
ada_boost_cross_validation_scores = print_cross_validation(ada_boost_clf, "Ada Boost Classifier", X_train, Y_train)
# Train the ada boost classifier model on the training data and print the results from testing on the test data
ada_boost_clf.fit(X_train, Y_train)
ada_boost_clf_pred = ada_boost_clf.predict(X_test)
print_statistics("Ada Boost Classifier", ada_boost_clf_pred, Y_test)
# Compute the paired t-test for a pair of models on the
# accuracy and recall per fold respectively
def paired_t_test(data1, data2, model1name, model2name):
    """Print whether two models differ significantly in per-fold accuracy and recall.

    The scores are compared with a paired (related-samples) t-test at the
    0.05 level: entry i of data1 is matched with entry i of data2, which
    assumes both models were scored on the same folds in the same order.

    :param data1: mapping with "test_accuracy" and "test_recall" sequences for model 1
    :param data2: the same mapping for model 2, with equal-length sequences
    :param model1name: label of model 1 in the printed text
    :param model2name: label of model 2 in the printed text
    """
    # ttest_rel is the paired test; ttest_ind would treat the two score
    # lists as unrelated samples and ignore the fold-by-fold matching
    accuracy_t, accuracy_p_value = stats.ttest_rel(data1["test_accuracy"], data2["test_accuracy"])
    recall_t, recall_p_value = stats.ttest_rel(data1["test_recall"], data2["test_recall"])
    print("\nPrinting stats for models " + model1name + " and " + model2name)
    print("The T statistic for accuracy is given by " + str(accuracy_t))
    if accuracy_p_value < 0.05:
        print("We reject the null hypothesis that the difference in accuracy of the models is not significantly different")
    else:
        print("The accuracy of the models is not significantly different")
    print("The T statistic for recall is given by " + str(recall_t))
    if recall_p_value < 0.05:
        print("We reject the null hypothesis that the difference in recall of the models is not significantly different")
    else:
        print("The recall of the models is not significantly different")
# Compare the per-fold cross-validation scores of every pair of models with
# paired_t_test (the neural network and the majority-vote model have no
# cross-validation scores, so they are not included)
paired_t_test(dt_clf_cross_validation_scores, knn_cross_validation_scores, "Decision Tree",
              "Nearest Neighbours")
paired_t_test(dt_clf_cross_validation_scores, rule_clf_cross_validation_scores, "Decision Tree",
              "Skope Rules")
paired_t_test(dt_clf_cross_validation_scores, ada_boost_cross_validation_scores, "Decision Tree",
              "Ada Boost Classifier")
paired_t_test(knn_cross_validation_scores, rule_clf_cross_validation_scores, "Nearest Neighbours",
              "Skope Rules")
paired_t_test(knn_cross_validation_scores, ada_boost_cross_validation_scores, "Nearest Neighbours",
              "Ada Boost Classifier")
# Label corrected from "Skope Rule" so the model is named the same way in every comparison
paired_t_test(rule_clf_cross_validation_scores, ada_boost_cross_validation_scores, "Skope Rules",
              "Ada Boost Classifier")
# Here we take a majority vote of all the models developed and check out the results!
def compute_majority_vote(pred_array):
    """Combine several models' predictions into one label per instance.

    :param pred_array: sequence of per-model prediction sequences, all of
        the same length as the first one
    :return: list with 1 where strictly more models predicted 1 than
        anything else, and 0 otherwise (ties go to 0)
    """
    num_models = len(pred_array)
    predictions = []
    for i in range(len(pred_array[0])):
        votes_for_minority = sum(1 for j in range(num_models) if pred_array[j][i] == 1)
        votes_for_majority = num_models - votes_for_minority
        predictions.append(1 if votes_for_minority > votes_for_majority else 0)
    return predictions
# Combine the test-set predictions of all five models by majority vote
# and print the statistics of the combined prediction
model_array = [dt_clf_pred, knn_pred, neural_network_pred, rule_clf_pred, ada_boost_clf_pred]
overall_pred = compute_majority_vote(model_array)
print_statistics("Overall Predictions", overall_pred, Y_test)
# Plot the ROC curve of each model on the test set.
# roc_auc_score and roc_curve take the true labels first and the model output
# second; the calls below pass them in that order.
# The curves are built from hard 0/1 predictions, not from probabilities.
plt.figure(0).clf()
dt_clf_auc = roc_auc_score(Y_test, dt_clf_pred)
dt_clf_fpr, dt_clf_tpr, _ = roc_curve(Y_test, dt_clf_pred, pos_label=1)
plt.plot(dt_clf_fpr, dt_clf_tpr, label="dt_clf, auc = " + str(dt_clf_auc))
knn_auc = roc_auc_score(Y_test, knn_pred)
knn_fpr, knn_tpr, _ = roc_curve(Y_test, knn_pred)
plt.plot(knn_fpr, knn_tpr, label="knn_clf, auc = " + str(knn_auc))
# The network's predictions may be a column vector; flatten them for use as scores
neural_network_scores = np.ravel(neural_network_pred)
neural_network_auc = roc_auc_score(Y_test, neural_network_scores)
neural_network_fpr, neural_network_tpr, _ = roc_curve(Y_test, neural_network_scores, pos_label=1)
plt.plot(neural_network_fpr, neural_network_tpr, label="neural net, auc = " + str(neural_network_auc))
rule_clf_auc = roc_auc_score(Y_test, rule_clf_pred)
rule_clf_fpr, rule_clf_tpr, _ = roc_curve(Y_test, rule_clf_pred, pos_label=1)
plt.plot(rule_clf_fpr, rule_clf_tpr, label="rule clf, auc = " + str(rule_clf_auc))
ada_boost_auc = roc_auc_score(Y_test, ada_boost_clf_pred)
ada_boost_fpr, ada_boost_tpr, _ = roc_curve(Y_test, ada_boost_clf_pred, pos_label=1)
plt.plot(ada_boost_fpr, ada_boost_tpr, label="ada boost clf, auc = " + str(ada_boost_auc))
overall_auc = roc_auc_score(Y_test, overall_pred)
overall_fpr, overall_tpr, _ = roc_curve(Y_test, overall_pred, pos_label=1)
plt.plot(overall_fpr, overall_tpr, label="overall clf, auc = "+str(overall_auc))
plt.legend(loc=0)
# Report the elapsed time before plt.show(), which is called last
print('The total running time of this script is {0:0.1f} seconds'.format(time.time() - start))
plt.show()