## solution_03_02ter.py
## Our splitting strategy doesn't seem to represent the reality of the process:
## the data form a time series, so validation folds should come after the data the model was trained on.
## inspired by https://hub.packtpub.com/cross-validation-strategies-for-time-series-forecasting-tutorial/
import numpy as np
import scipy as sc
import scipy.stats  # makes sc.stats available for the correlation at the end
import matplotlib.pyplot as plt
from operator import itemgetter
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
y = np.array(features['actual'])  # 'features' is the DataFrame loaded earlier in the exercise
# Remove the label and the other columns we don't want as features
# axis 1 refers to the columns
X = features.drop(['Unnamed: 0', 'year', 'month', 'day',
                   'actual', 'forecast_noaa', 'forecast_acc', 'forecast_under',
                   'week_Fri', 'week_Mon', 'week_Sat', 'week_Sun', 'week_Thurs',
                   'week_Tues', 'week_Wed'], axis=1)
## the training data is the oldest 75% of the rows, the test data the most recent 25%
split = int(len(X.index) * 0.75)
X_train = np.array(X)[:split, :]
X_test = np.array(X)[split:, :]
y_train = y[:split]
y_test = y[split:]
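# The same chronological split can also be written with sklearn's train_test_split by
# turning shuffling off; this is just an equivalence check, assuming the same 75/25
# ratio as above (the *_alt variable names are illustrative only).
from sklearn.model_selection import train_test_split
X_train_alt, X_test_alt, y_train_alt, y_test_alt = train_test_split(
    np.array(X), y, test_size=0.25, shuffle=False)
print('same chronological split as the manual slicing:',
      np.array_equal(X_train_alt, X_train) and np.array_equal(X_test_alt, X_test))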
# define the hyperparameters you want to test, with the range over which each should be tested
grid_values = {'criterion': ['squared_error'],
               'n_estimators': [300, 600, 900],
               'max_depth': [2, 5, 7],
               'min_samples_split': [4],
               'min_samples_leaf': [2]}
tscv = TimeSeriesSplit()
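# TimeSeriesSplit respects temporal order: each fold trains on an expanding window of
# past rows and validates on the block that immediately follows it, so the model is
# never evaluated on data older than what it was fitted on. A quick look at the fold
# layout on our training set (purely illustrative):
for fold, (tr_idx, va_idx) in enumerate(tscv.split(X_train)):
    print('fold {}: train rows 0-{}, validation rows {}-{}'.format(
        fold, tr_idx[-1], va_idx[0], va_idx[-1]))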
# Feed it to GridSearchCV together with the score on which the selection should be based
grid_tree_acc = GridSearchCV(RandomForestRegressor(),
                             param_grid=grid_values,
                             scoring='r2',
                             cv=tscv,
                             n_jobs=-1)
grid_tree_acc.fit(X_train, y_train)
print('Grid best parameter (max. r2): ', grid_tree_acc.best_params_)  # get the best parameters
print('Grid best score (r2): ', grid_tree_acc.best_score_)  # get the best score, computed on the train/validation folds
# get the equivalent score on the test set: again, this is the important metric
y_decision_fn_scores_acc = grid_tree_acc.score(X_test, y_test)
print('Grid best parameter (max. r2) model on test: ', y_decision_fn_scores_acc)
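# If you want to see how each hyperparameter combination behaved across the
# time-ordered validation folds, GridSearchCV keeps the full table in cv_results_;
# a small sketch (assumes pandas, already needed to load the data, is available):
import pandas as pd
cv_results = pd.DataFrame(grid_tree_acc.cv_results_)
print(cv_results[['params', 'mean_test_score', 'std_test_score']]
      .sort_values('mean_test_score', ascending=False).head())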
## feature importances
RF = grid_tree_acc.best_estimator_
W = RF.feature_importances_  # get the weights
sorted_features = sorted([[list(X.columns)[i], abs(W[i])] for i in range(len(W))],
                         key=itemgetter(1), reverse=True)
print('Features sorted by importance in the prediction process')
for f, w in sorted_features:
    print('{:>20}\t{:.3f}'.format(f, w))
from sklearn.inspection import permutation_importance
feature_importance = RF.feature_importances_  # get the weights (mean decrease in impurity)
std = np.std([tree.feature_importances_ for tree in RF.estimators_], axis=0)
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], xerr=std[sorted_idx], align='center')
plt.yticks(pos, np.array(list(X.columns))[sorted_idx])
plt.title('Feature Importance (MDI)', fontsize=10)
result = permutation_importance(RF, X_test, y_test, n_repeats=10,
                                random_state=42, n_jobs=2)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(result.importances[sorted_idx].T,
            vert=False, labels=np.array(list(X.columns))[sorted_idx])
plt.title("Permutation Importance (test set)", fontsize=10)
fig.tight_layout()
plt.show()
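# Text version of the permutation importances (mean +/- std over the 10 shuffles),
# from most to least important, to compare with the MDI ranking printed above:
for i in sorted_idx[::-1]:
    print('{:>20}\t{:.3f} +/- {:.3f}'.format(list(X.columns)[i],
                                              result.importances_mean[i],
                                              result.importances_std[i]))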
## plotting the fit
plt.plot(y, RF.predict(X), 'ro')
plt.xlabel('True values')
plt.ylabel('Predicted values')
plt.title('Pearson r = {:.3f}'.format(sc.stats.pearsonr(y, RF.predict(X))[0]))
plt.show()
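## the correlation above mixes training and test points, so it is optimistic;
## the same plot restricted to the held-out test set is a stricter check (sketch):
plt.figure()
plt.plot(y_test, RF.predict(X_test), 'ro')
plt.xlabel('True values (test set)')
plt.ylabel('Predicted values (test set)')
plt.title('test Pearson r = {:.3f}'.format(sc.stats.pearsonr(y_test, RF.predict(X_test))[0]))
plt.show()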