# testingbayes.py
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn import metrics
from sklearn.metrics.pairwise import cosine_similarity
from bayes_opt import BayesianOptimization
# Load the dataset
training_data_path = 'Compiled and Labelled - Sheet1 Numeric.csv' # Path to the data file
ds = pd.read_csv(training_data_path)
# Separate features and target labels
X = ds[["channel 1", "channel 2", "channel 3", "channel 4",
"channel 5", "channel 6", "channel 7", "channel 8"]]
y = ds["Class"]
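# Optional sanity check (a minimal sketch; the expected column names are taken
# from the feature list above, not guaranteed by the CSV itself):
# expected_cols = [f"channel {i}" for i in range(1, 9)] + ["Class"]
# missing = [c for c in expected_cols if c not in ds.columns]
# assert not missing, f"Missing columns in dataset: {missing}"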
# Check for highly similar samples between the training and testing sets
def check_similarity(X_train, X_test, threshold=0.95):
    # Compute the cosine similarity matrix between training and testing samples
    similarity_matrix = cosine_similarity(X_train, X_test)
    # Count train/test pairs whose similarity exceeds the threshold
    high_similarity_pairs = (similarity_matrix > threshold).sum()
    print("Number of highly similar train/test pairs:", high_similarity_pairs)
    if high_similarity_pairs > 0:
        print(f"Warning: found {high_similarity_pairs} pairs with similarity over {threshold}. This may indicate data leakage.")
# Split the data into training and testing sets; fix the random state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# # Check for similar samples
# check_similarity(X_train, X_test)
# # Verify class distribution to avoid bias between training and test sets
# print("\nTraining set class distribution:")
# print(y_train.value_counts(normalize=True))
# print("\nTesting set class distribution:")
# print(y_test.value_counts(normalize=True))
# Cross-validated objective for Bayesian optimization. BayesianOptimization
# maximizes its objective, and "neg_mean_squared_error" is already negated
# (higher is better), so the mean CV score is returned directly; negating it
# again would make the optimizer search for the worst hyperparameters.
def objective(n_estimators, max_depth, min_samples_split, max_features):
    model = RandomForestRegressor(n_estimators=int(n_estimators),
                                  max_depth=int(max_depth),
                                  min_samples_split=int(min_samples_split),
                                  max_features=min(max_features, 0.999),
                                  random_state=42)
    return cross_val_score(model, X_train, y_train, cv=3, scoring="neg_mean_squared_error").mean()
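# Optional sanity check before launching the full search (illustrative values,
# not tuned): a single call should return a negative number, since the objective
# is the mean negated MSE, with values closer to zero indicating a better fit.
# print(objective(n_estimators=100, max_depth=10, min_samples_split=2, max_features=0.5))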
# Bounds for hyperparameters
param_bounds = {
'n_estimators': (10, 250),
'max_depth': (1, 50),
'min_samples_split': (2, 25),
'max_features': (0.1, 0.999),
}
optimizer = BayesianOptimization(f=objective, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=15)
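# Optionally inspect the search history; bayes_opt stores one dict per
# evaluation in optimizer.res, each with "target" and "params" keys.
# for i, res in enumerate(optimizer.res):
#     print(f"Iteration {i}: target={res['target']}, params={res['params']}")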
best_params = optimizer.max['params']
print("Best hyperparameters found:", best_params)
# Refit on the full training set with the best hyperparameters found
final_model = RandomForestRegressor(n_estimators=int(best_params['n_estimators']),
                                    max_depth=int(best_params['max_depth']),
                                    min_samples_split=int(best_params['min_samples_split']),
                                    max_features=best_params['max_features'],
                                    random_state=42)
final_model.fit(X_train, y_train)
score = final_model.score(X_test, y_test)
print(f"Test R^2 Score: {score}")
# Rebuild the same configuration with explicitly typed hyperparameters
best_params_formatted = {
    'n_estimators': int(best_params['n_estimators']),
    'max_depth': int(best_params['max_depth']),
    'min_samples_split': int(best_params['min_samples_split']),
    'max_features': best_params['max_features']
}
optimized_rf = RandomForestRegressor(**best_params_formatted, random_state=42)
optimized_rf.fit(X_train, y_train)
y_pred = optimized_rf.predict(X_test)
# accuracy_score only applies to classification; use r2_score for a regressor's predictions
score = metrics.r2_score(y_test, y_pred)
print(f"Test R^2 Score with Optimized Hyperparameters: {score}")
# # Using Stratified K-Fold cross-validation
# skf = StratifiedKFold(n_splits=5)
# cross_val_scores = cross_val_score(model, X, y, cv=skf)
# print("\nCross-validation scores across folds:", cross_val_scores)
# print("Mean cross-validation score:", cross_val_scores.mean())
# # Fit the model on the training set
# model.fit(X_train, y_train)
# # Evaluate on the test set to check generalization
# y_pred = model.predict(X_test)
# print("\nTest accuracy of the model:", metrics.accuracy_score(y_test, y_pred))
# # Save the model for future use
# filename = 'model.pkl'
# pickle.dump(model, open(filename, 'wb'))
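# A minimal sketch of persisting the tuned regressor, mirroring the commented-out
# block above; the filename 'optimized_rf.pkl' is an assumption, not from the script.
# with open('optimized_rf.pkl', 'wb') as f:
#     pickle.dump(optimized_rf, f)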