-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmodel.py
424 lines (340 loc) · 15.7 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
import pandas as pd
import numpy as np
from scipy import stats
from math import sqrt
import matplotlib.pyplot as plt
from preprocessing import spotify_split, scale_data
from acquire import concat_csv_files
from prepare import prepare_df, set_index
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, TweedieRegressor, LassoLars
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import IsolationForest, RandomForestRegressor
def get_model_features(df):
    '''
    Return a new DataFrame holding only the columns kept for predictive modeling.

    The descriptive, album-level, release-date and duration columns listed
    below are removed; every other column is passed through untouched.
    The input DataFrame itself is not modified.
    '''
    # columns that are not used as model features
    non_feature_columns = [
        'artist', 'album', 'release_date', 'track_name', 'label',
        'album_popularity', 'album_id', 'album_type',
        'release_year', 'release_month', 'release_day',
        'duration_ms', 'duration_minutes',
    ]
    return df.drop(columns=non_feature_columns)
def OLS_model(X, y, X_v, y_v):
    '''
    This function creates, fits, and evaluates an ordinary least squares (OLS)
    linear regression model.

    Parameters:
        X, y     : training features and training target
        X_v, y_v : validate features and validate target

    Prints the train and validate RMSE (rounded to 6 decimals) and returns
    (train predictions, train RMSE, validate predictions, validate RMSE).
    '''
    # create the model object
    # NOTE: the `normalize` argument was deprecated in scikit-learn 1.0 and
    # removed in 1.2, so passing it raises a TypeError on current releases.
    # For an unpenalized least-squares fit it does not change the predictions,
    # so it is simply omitted here.
    lm = LinearRegression()
    # fit the model to our training data.
    lm.fit(X, y)
    # predict on train
    lm_pred = lm.predict(X)
    # compute root mean squared error
    lm_rmse = sqrt(mean_squared_error(y, lm_pred))
    # predict on validate
    lm_pred_v = lm.predict(X_v)
    # compute root mean squared error
    lm_rmse_v = sqrt(mean_squared_error(y_v, lm_pred_v))
    print("RMSE for OLS using Linear Regression\n\nOn train data:\n", round(lm_rmse, 6), '\n\n',
          "On validate data:\n", round(lm_rmse_v, 6))
    return lm_pred, lm_rmse, lm_pred_v, lm_rmse_v
################################## RUN A MODEL ##################################
def get_baseline_metrics(y_tr):
    '''
    Build the mean-value baseline for the training target and score it.

    The baseline predicts the mean of y_tr for every observation. Its RMSE
    against y_tr is rounded to 6 decimal places, printed, and returned
    together with the baseline value as (baseline, baseline_rmse).
    '''
    # the baseline prediction is the mean of the train target
    baseline_value = np.mean(y_tr)
    # one constant prediction per training observation
    constant_predictions = np.full(len(y_tr), baseline_value)
    # rmse of that constant prediction, rounded to 6 decimal places
    baseline_rmse = round(sqrt(mean_squared_error(y_tr, constant_predictions)), 6)
    # report the baseline rmse
    print('RMSE (Root Mean Square Error) of Baseline on train data:\n', baseline_rmse)
    return baseline_value, baseline_rmse
def linear_regression_model(X_tr, y_tr, X_v, y_v, X_te, y_te, **kwargs):
    '''
    Fit an OLS linear regression on the train split and score it on the
    train, validate, and test splits.

    Any key word arguments are forwarded to LinearRegression. Only the train
    RMSE is printed; the function returns (train_rmse, validate_rmse, test_rmse).
    '''
    # build the ols model and fit it to the train data
    model = LinearRegression(**kwargs)
    model.fit(X_tr, y_tr)

    # predict and score each split in turn: train, validate, test
    splits = ((X_tr, y_tr), (X_v, y_v), (X_te, y_te))
    train_rmse, validate_rmse, test_rmse = [
        sqrt(mean_squared_error(target, model.predict(features)))
        for features, target in splits
    ]

    # report the train rmse only
    print('RMSE for OLS using Linear Regression \n')
    print('On train data:\n', round(train_rmse, 6), '\n')
    return train_rmse, validate_rmse, test_rmse
def lasso_lars(X_tr, y_tr, X_v, y_v, X_te, y_te, **kwargs):
    '''
    Fit a LassoLars model on the train split and score it on the train,
    validate, and test splits.

    Any key word arguments are forwarded to LassoLars. Only the train RMSE
    is printed; the function returns (train_rmse, validate_rmse, test_rmse).
    '''
    # build the lasso lars model and fit it to the train data
    model = LassoLars(**kwargs)
    model.fit(X_tr, y_tr)

    def split_rmse(features, target):
        # predict on one split and measure the root mean squared error
        return sqrt(mean_squared_error(target, model.predict(features)))

    train_rmse = split_rmse(X_tr, y_tr)
    validate_rmse = split_rmse(X_v, y_v)
    test_rmse = split_rmse(X_te, y_te)

    # report the train rmse only
    print('RMSE for LASSO + LARS \n')
    print('On train data:\n', round(train_rmse, 6), '\n')
    return train_rmse, validate_rmse, test_rmse
def polynomial_regression(X_tr, y_tr, X_v, y_v, X_te, y_te, dstring, **kwargs):
    '''
    Expand the features with PolynomialFeatures, fit a linear regression on
    the expanded train split, and score it on train, validate, and test.

    Key word arguments are forwarded to PolynomialFeatures; dstring is only
    used in the printed header. Only the train RMSE is printed. Returns
    (train_rmse, validate_rmse, test_rmse, test_predictions).
    '''
    # learn the polynomial expansion on train, then apply it to every split
    expander = PolynomialFeatures(**kwargs)
    expanded_train = expander.fit_transform(X_tr)
    expanded_validate = expander.transform(X_v)
    expanded_test = expander.transform(X_te)

    # fit the linear regression on the expanded train features
    regressor = LinearRegression()
    regressor.fit(expanded_train, y_tr)

    def score(features, target):
        # predict on one split; hand back its rmse and the predictions
        predicted = regressor.predict(features)
        return sqrt(mean_squared_error(target, predicted)), predicted

    train_rmse, _ = score(expanded_train, y_tr)
    validate_rmse, _ = score(expanded_validate, y_v)
    test_rmse, test_predictions = score(expanded_test, y_te)

    # report the train rmse only
    print(f'RMSE for Polynomial {dstring} + Linear Regression \n')
    print('On train data:\n', round(train_rmse, 6), '\n')
    return train_rmse, validate_rmse, test_rmse, test_predictions
def svr_model(X_tr, y_tr, X_v, y_v, X_te, y_te, kern_str, **kwargs):
    '''
    Fit a support vector regressor on the train split and score it on the
    train, validate, and test splits.

    Key word arguments are forwarded to SVR (the kernel choice is the most
    influential one: linear, polynomial, or a gaussian type such as RBF).
    kern_str is only used in the printed header. Only the train RMSE is
    printed; returns (train_rmse, validate_rmse, test_rmse).
    '''
    from sklearn.svm import SVR

    # create the model object and fit it to the training data
    regressor = SVR(**kwargs)
    regressor.fit(X_tr, y_tr)

    # predict and compute root mean squared error for train, validate, test
    train_rmse, validate_rmse, test_rmse = (
        sqrt(mean_squared_error(target, regressor.predict(features)))
        for features, target in ((X_tr, y_tr), (X_v, y_v), (X_te, y_te))
    )

    print(f'RMSE for SVR using {kern_str} Kernel \n')
    print('On train data:\n', round(train_rmse, 6), '\n')
    return train_rmse, validate_rmse, test_rmse
def glm_model(X_tr, y_tr, X_v, y_v, X_te, y_te, d_str, **kwargs):
    '''
    Fit a Generalized Linear Model (TweedieRegressor) on the train split and
    score it on the train, validate, and test splits.

    Key word arguments are forwarded to TweedieRegressor; its power parameter
    determines the underlying distribution. d_str is only used in the printed
    header. Only the train RMSE is printed. Returns
    (train_rmse, validate_rmse, test_rmse, test_predictions).
    '''
    # create the model object and fit it to the training data
    regressor = TweedieRegressor(**kwargs)
    regressor.fit(X_tr, y_tr)

    def score(features, target):
        # predict on one split; hand back its rmse and the predictions
        predicted = regressor.predict(features)
        return sqrt(mean_squared_error(target, predicted)), predicted

    train_rmse, _ = score(X_tr, y_tr)
    validate_rmse, _ = score(X_v, y_v)
    test_rmse, test_predictions = score(X_te, y_te)

    print(f'RMSE for GLM using {d_str} Distribution \n')
    print('On train data:\n', round(train_rmse, 6), '\n')
    return train_rmse, validate_rmse, test_rmse, test_predictions
################################# EVALUATE & TEST #################################
def evaluate_df(bl_train_rmse, lm_rmse, lars_rmse, lars_rmse_v, lm_sq_rmse, lm_sq_rmse_v, svr_rmse, glm_rmse, glm_rmse_v, glm_rmse_t):
    '''
    This function creates a dataframe with rmse as the evaluating metric for
    the models we used. Columns are the datasets' rmse assessed for each model;
    a '-' marks a model/dataset combination that is not reported.

    It also prints how the GLM test rmse compares to the baseline train rmse,
    as a percentage of the baseline rmse.

    Returns the dataframe sorted by train_rmse (best model first).
    '''
    columns = ['train_rmse', 'validate_rmse', 'test_rmse']
    index = ['baseline', 'ols', 'lassolars', 'pf2_lr', 'SVM', 'GLM']
    data = [[bl_train_rmse, '-', '-'],
            [lm_rmse, '-', '-'],
            [lars_rmse, lars_rmse_v, '-'],
            [lm_sq_rmse, lm_sq_rmse_v, '-'],
            [svr_rmse, '-', '-'],
            [glm_rmse, glm_rmse_v, glm_rmse_t]]

    if bl_train_rmse == 0:
        # a perfect baseline leaves no meaningful relative change to report
        print('Baseline rmse is 0; relative change is undefined')
    else:
        relative_change = (glm_rmse_t - bl_train_rmse) / bl_train_rmse
        percent = abs(relative_change) * 100
        # a lower rmse is better; the sign of the change must decide the wording,
        # otherwise a model that is worse than baseline is reported as a win
        if relative_change <= 0:
            print(f'Model beat baseline by {percent:.2f}%')
        else:
            print(f'Model underperformed baseline by {percent:.2f}%')

    return pd.DataFrame(columns=columns, data=data, index=index).sort_values(by='train_rmse')
def visualize_model(y_predictions, y_actual, baseline_predictions, model_name):
    '''
    Draw a model's predictions against the actual values, together with the
    ideal y = x line and the baseline.

    Takes the model's predictions, the actual values, and the baseline value
    (drawn as a horizontal line). model_name is shown in the plot legend.
    The figure is displayed with plt.show(); nothing is returned.
    '''
    baseline_style = dict(alpha=.95, color="gray", linestyle=':',
                          label='Baseline Prediction: Average')
    ideal_style = dict(alpha=.9, color="blue",
                       label='The Ideal Line: Actual Values')
    model_style = dict(alpha=.3, color='red', s=30,
                       label=f'Model: {model_name}')

    plt.figure(figsize=(16,8))
    # the baseline only ever predicts one value, hence a horizontal gray line
    plt.axhline(baseline_predictions, **baseline_style)
    # a perfect model lies on y = x: an actual 10 is predicted 10, 60 is 60, etc.
    plt.plot(y_actual, y_actual, **ideal_style)
    # one point per observation (song popularity): actual value vs. predicted value
    plt.scatter(y_actual, y_predictions, **model_style)

    # legend, axis labels and title
    plt.legend()
    plt.xlabel("Actual Song Popularity", size=13)
    plt.ylabel("Predicted Song Popularity", size=13)
    plt.title("How Does the Final Model Compare to Baseline? And to Actual Values?", size=15)
    plt.show()
def visualize_error(y_predictions, y_actual, baseline_predictions, model_name):
    '''
    Plot each prediction's residual (predicted value minus actual value)
    against the actual value.

    The further a point lies from the horizontal zero line, the greater the
    error for that prediction. baseline_predictions is accepted but not used
    by this plot. model_name is shown in the plot legend. The figure is
    displayed with plt.show(); nothing is returned.
    '''
    point_style = dict(alpha=.3, color='red', s=30, label=f'Model: {model_name}')

    plt.figure(figsize=(16,8))
    # an observation predicted with no error would sit exactly on this line
    plt.axhline(label="The Ideal Line: No Error")
    # actual song popularity vs. how far the prediction is from the actual
    residuals = y_predictions - y_actual
    plt.scatter(y_actual, residuals, **point_style)

    # legend, axis labels and title
    plt.legend()
    plt.xlabel("Actual Song Popularity", size=13)
    plt.ylabel("Residual/Error: Predicted Popularity - Actual Popularity", size=13)
    plt.title("Do the Size of Errors Change as the Popularity Changes?", size=15)
    plt.show()
def polyreg_predictions(X_tr_top, X_te_top, y_train):
    '''
    Fit the 2nd degree polynomial regression on the given train features and
    predict on the given test features.

    Returns (fitted linear regression model, test predictions, fitted
    PolynomialFeatures transformer) so the model and the transformer can be
    reused later.
    '''
    from sklearn.preprocessing import PolynomialFeatures

    # learn the degree-2 expansion from the train features
    expander = PolynomialFeatures(degree=2).fit(X_tr_top)
    # expand the train and the test features with it
    expanded_train = expander.transform(X_tr_top)
    expanded_test = expander.transform(X_te_top)

    # fit the linear regression on the expanded train features
    regressor = LinearRegression()
    regressor.fit(expanded_train, y_train)

    # predict on the expanded test features
    test_predictions = regressor.predict(expanded_test)
    return regressor, test_predictions, expander
def plot_polyreg(y_test, lm_sq_pred_t, y_pred_test, bl):
    '''
    Plots the baseline, polynomial line of best fit,
    and the actual vs predicted values for popularity

    Parameters:
        y_test       : actual values (x axis)
        lm_sq_pred_t : predictions drawn as the scatter points
        y_pred_test  : predictions the quadratic line of best fit is fitted to
        bl           : baseline value, drawn as a horizontal line

    Shows the figure and prints the fitted equation; nothing is returned.
    '''
    from scipy.optimize import curve_fit

    # quadratic curve to fit: a is the linear coefficient, b the quadratic one
    def objective(x, a, b, c):
        return a * x + b * x**2 + c

    x, y = y_test, y_pred_test
    # curve_fit returns the optimal parameters in the objective's argument order
    popt, _ = curve_fit(objective, x, y)
    a, b, c = popt
    # the x^2 term must show b and the x term must show a, matching objective()
    equation = 'y = %.3f $x^2$ + %.3f x + %.2f' % (b, a, c)

    # first, a black dashed line representing the baseline prediction,
    # a horizontal line because it only predicts the average
    plt.figure(figsize=(16,8))
    plt.axhline(bl, alpha=.95, color="black", linestyle='--', label='Baseline Prediction: Average')
    # define a sequence of inputs between the smallest and largest known inputs
    x_line = np.arange(min(x), max(x), 1)
    # calculate the output for the range
    y_line = objective(x_line, a, b, c)
    # create a line plot for the mapping function
    plt.plot(x_line, y_line, color='dodgerblue', linewidth=3, label='The Line of Best Fit')
    # next, a scatter plot representing each observation (song popularity) as the actual value vs. the predicted value
    plt.scatter(y_test, lm_sq_pred_t,
                alpha=.4, color='red', s=50, label='Model: Polynomial Regression - 2nd Degree')
    # annotate the plot with the fitted equation (arrow position is fixed)
    plt.annotate(equation, xy=(35, 38.5), xytext=(40,17), size=14,
                 arrowprops=dict(facecolor='dodgerblue', shrink=0.05, alpha=.8, linewidth=.5))
    plt.legend(fontsize=14)
    plt.xticks([0, 20, 40, 60, 80, 100])
    plt.yticks([0, 20, 40, 60])
    plt.show()
    print(equation)
################################# FEATURE IMPORTANCES #################################
def get_important_feats(lm_sq, pf, X_tr_top):
    '''
    This function extracts the most influential features
    by creating a dataframe of the most important features/combination,
    with their rank

    Parameters:
        lm_sq    : fitted linear model; its coef_ values are the importances
        pf       : fitted PolynomialFeatures transformer that names the features
        X_tr_top : DataFrame whose columns are the transformer's input features

    Returns a dataframe indexed by feature name (index name 'index') with the
    columns 'rank' (1 = largest coefficient) and 'importance', sorted from the
    largest to the smallest coefficient.
    '''
    # get_feature_names was removed in scikit-learn 1.2 in favor of
    # get_feature_names_out; use the new method when it exists and
    # fall back to the old one on older releases
    name_features = getattr(pf, 'get_feature_names_out', None)
    if name_features is None:
        name_features = pf.get_feature_names
    feature_names = name_features(X_tr_top.columns)

    feature_importances = pd.DataFrame(lm_sq.coef_,
                                       index=feature_names,
                                       columns=['importance']).sort_values('importance', ascending=False)
    # rank 1 is the largest coefficient, counting up from there
    feature_importances.insert(0, 'rank', range(1, len(feature_importances) + 1))
    feature_importances.index.name = 'index'
    return feature_importances
def plot_top_feats(feature_importances):
    '''
    Draw two side-by-side horizontal bar charts from the feature importances:
    the first 5 rows on the left (top positive drivers of popularity) and the
    last 5 rows on the right (top negative drivers), according to our model.

    Expects the dataframe to be sorted from the largest to the smallest
    importance. The x tick positions of both panels are fixed.
    '''
    importance = feature_importances.importance
    # (subplot position, rows to draw, fixed x tick positions)
    panels = (
        (121, importance.head(5), [0, 20, 40, 60]),
        (122, importance.tail(5), [0, -40, -80, -120, -160]),
    )

    plt.figure(figsize=(15,7))
    for position, drivers, tick_positions in panels:
        plt.subplot(position)
        drivers.sort_values(ascending=True).plot(kind='barh')
        plt.ylabel('')
        plt.yticks(size=16)
        plt.xticks(tick_positions)
    plt.tight_layout()