Commit
Update ml_model.py
- Replaced the models with XGBoost, Random Forest, and LightGBM for a speedup.
- Sharded jobs across n_cpus for better utilization of multi-core processors.
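The sharding idea: give each candidate model in a grid search its own slice of the CPU budget, and let GridSearchCV run the remaining slices in parallel, so threads per fit times concurrent fits never exceeds n_cpus. A minimal sketch of the scheme, assuming a hypothetical two-parameter random-forest grid (the commit's own grids differ):

    from multiprocessing import cpu_count

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV

    n_cpus = max(1, cpu_count() - 1)  # leave one core free for the OS

    # Hypothetical grid, for illustration only.
    param_grid = {'n_estimators': [100, 500], 'max_depth': [5, 10]}
    n_models = 1
    for val in param_grid.values():
        n_models *= len(val)  # 4 candidate models here

    # Each candidate fit gets n_jobs_per_model threads; GridSearchCV launches
    # n_cpus // n_jobs_per_model fits at once, so the product stays <= n_cpus.
    n_jobs_per_model = min(max(1, n_cpus // n_models), n_cpus)

    search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42, n_jobs=n_jobs_per_model),
        param_grid=param_grid,
        cv=3,
        n_jobs=max(1, n_cpus // n_jobs_per_model),
        scoring='neg_mean_squared_error',
    )
    # search.fit(X_train, y_train) then uses at most n_cpus cores overall.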
robinbg authored Jan 5, 2023
1 parent 8b02b3b commit 5ef8acf
Showing 1 changed file with 84 additions and 47 deletions.
131 changes: 84 additions & 47 deletions ml_model.py
@@ -18,14 +18,18 @@

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout


from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import time
import os
import errno

from multiprocessing import cpu_count

n_cpus = max(1, cpu_count() - 1)  # leave one core free, but never drop below one


def prepare_rolling_train(df, features_column, label_column, date_column, unique_datetime, testing_windows, first_trade_date_index, max_rolling_window_index, current_index):
if current_index <= max_rolling_window_index:
@@ -117,9 +121,12 @@ def train_random_forest(X_train, y_train):
# scoring_method = 'neg_mean_absolute_error'
scoring_method = 'neg_mean_squared_error'
#scoring_method = 'neg_mean_squared_log_error'

n_models = 1
for val in random_grid.values():
n_models *= len(val)
# Shard the CPU budget: each candidate model trains with n_jobs_per_model
# threads, while GridSearchCV runs n_cpus // n_jobs_per_model fits in parallel.
n_jobs_per_model = min(max(1, n_cpus//n_models), n_cpus)
# my_cv_rf = TimeSeriesSplit(n_splits=5).split(X_train_rf)
rf = RandomForestRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42, n_jobs=n_jobs_per_model)
#RandomizedSearchCV
#randomforest_regressor = RandomizedSearchCV(estimator=rf,
# param_distributions=random_grid,
@@ -132,7 +139,7 @@ def train_random_forest(X_train, y_train):
randomforest_regressor = GridSearchCV(estimator=rf,
param_grid=random_grid,
cv=3,
n_jobs=-1,
n_jobs=n_cpus // n_jobs_per_model,
scoring=scoring_method,
verbose=0)

@@ -171,18 +178,24 @@ def train_svm(X_train, y_train):
return model


def train_gbm(X_train, y_train):
gbm = GradientBoostingRegressor(random_state = 42)
def train_lightgbm(X_train, y_train):
param_grid_gbm = {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [100, 250, 500, 1000]}
n_models = 1
for val in param_grid_gbm.values():
n_models *= len(val)
n_jobs_per_model = min(max(1, n_cpus//n_models), n_cpus)
lightgbm = LGBMRegressor(random_state=42, n_jobs=n_jobs_per_model)
# scoring_method = 'r2'
# scoring_method = 'explained_variance'
# scoring_method = 'neg_mean_absolute_error'
scoring_method = 'neg_mean_squared_error'
#scoring_method = 'neg_mean_squared_log_error'
gbm_regressor = GridSearchCV(estimator=gbm, param_grid=param_grid_gbm,
cv=3, n_jobs=-1, scoring=scoring_method, verbose=0)
gbm_regressor = GridSearchCV(estimator=lightgbm, param_grid=param_grid_gbm,
cv=3, n_jobs=n_cpus // n_jobs_per_model, scoring=scoring_method, verbose=0)

gbm_regressor.fit(X_train, y_train)
model = gbm_regressor.best_estimator_
@@ -195,7 +208,31 @@



def train_xgb(X_train, y_train):
xgb = XGBRegressor(random_state = 42, n_jobs=10)

param_grid_gbm = {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [100, 250, 500, 1000]}
n_models = 1
for val in param_grid_gbm.values():
n_models *= len(val)
n_jobs_per_model = min(max(1, n_cpus//n_models), n_cpus)
xgb = XGBRegressor(random_state=42, n_jobs=n_jobs_per_model)
# scoring_method = 'r2'
# scoring_method = 'explained_variance'
# scoring_method = 'neg_mean_absolute_error'
scoring_method = 'neg_mean_squared_error'
#scoring_method = 'neg_mean_squared_log_error'
xgb_regressor = GridSearchCV(estimator=xgb, param_grid=param_grid_gbm,
cv=3, n_jobs=n_cpus // n_jobs_per_model, scoring=scoring_method, verbose=0)

xgb_regressor.fit(X_train, y_train)
model = xgb_regressor.best_estimator_
'''
gbm_regressor = GradientBoostingRegressor()
model = gbm_regressor.fit(X_train, y_train)
'''
return model


def train_ada(X_train, y_train):
ada = AdaBoostRegressor()

@@ -255,18 +292,17 @@ def run_4model(df,features_column, label_column,date_column,tic_column,
max_rolling_window_index=44):
## initialize all the result tables
## need date as index and unique tic name as columns
df_predict_lr = pd.DataFrame(columns=unique_ticker, index=trade_date)
df_predict_rf = pd.DataFrame(columns=unique_ticker, index=trade_date)
df_predict_ridge = pd.DataFrame(columns=unique_ticker, index=trade_date)
df_predict_gbm = pd.DataFrame(columns=unique_ticker, index=trade_date)

df_predict_xgb = pd.DataFrame(columns=unique_ticker, index=trade_date)
df_predict_best = pd.DataFrame(columns=unique_ticker, index=trade_date)
df_best_model_name = pd.DataFrame(columns=['model_name'], index=trade_date)
evaluation_record = {}
# first trade date is 1995-06-01
# first_trade_date_index = 20
# testing_windows = 6

import re
# LightGBM does not accept special characters in feature names, so strip them.
df = df.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
for i in range(first_trade_date_index, len(unique_datetime)):
try:
# prepare training data
@@ -303,33 +339,38 @@
current_index=i)

# Training
lr_model = train_linear_regression(X_train, y_train)
# lr_model = train_linear_regression(X_train, y_train)

t = time.perf_counter()
xgb_model = train_xgb(X_train, y_train)
print(f"xgb:{time.perf_counter() - t}s")
t = time.perf_counter()
gbm_model = train_lightgbm(X_train, y_train)
print(f"gbm:{time.perf_counter() - t}s")
t = time.perf_counter()
rf_model = train_random_forest(X_train, y_train)
ridge_model = train_ridge(X_train, y_train)
gbm_model = train_gbm(X_train, y_train)
print(f"rf:{time.perf_counter() - t}s")
# ridge_model = train_ridge(X_train, y_train)



# Validation
lr_eval = evaluate_model(lr_model, X_test, y_test)
rf_eval = evaluate_model(rf_model, X_test, y_test)
ridge_eval = evaluate_model(ridge_model, X_test, y_test)
gbm_eval = evaluate_model(gbm_model, X_test, y_test)

xgb_eval = evaluate_model(xgb_model, X_test, y_test)
gbm_eval = evaluate_model(gbm_model, X_test, y_test)
# Trading
y_trade_lr = lr_model.predict(X_trade)

y_trade_rf = rf_model.predict(X_trade)
y_trade_ridge = ridge_model.predict(X_trade)
y_trade_gbm = gbm_model.predict(X_trade)


y_trade_xgb = xgb_model.predict(X_trade)
y_trade_gbm = gbm_model.predict(X_trade)
# Decide the best model
eval_data = [[lr_eval, y_trade_lr],
eval_data = [
[rf_eval, y_trade_rf],
[ridge_eval, y_trade_ridge],
[xgb_eval, y_trade_xgb],
[gbm_eval, y_trade_gbm]
]
eval_table = pd.DataFrame(eval_data, columns=['model_eval', 'model_predict_return'],
index=['lr', 'rf','ridge','gbm'])
index=['rf', 'xgb', 'gbm'])


evaluation_record[unique_datetime[i]]=eval_table
@@ -345,22 +386,20 @@ def run_4model(df,features_column, label_column,date_column,tic_column,
df_best_model_name.loc[unique_datetime[i]] = best_model_name

# Prepare Predicted Return table
append_return_table(df_predict_lr, unique_datetime, y_trade_lr, trade_tic, current_index=i)
append_return_table(df_predict_rf, unique_datetime, y_trade_rf, trade_tic, current_index=i)
append_return_table(df_predict_ridge, unique_datetime, y_trade_ridge, trade_tic, current_index=i)
append_return_table(df_predict_xgb, unique_datetime, y_trade_xgb, trade_tic, current_index=i)
append_return_table(df_predict_gbm, unique_datetime, y_trade_gbm, trade_tic, current_index=i)

append_return_table(df_predict_best, unique_datetime, y_trade_best, trade_tic, current_index=i)

print('Trade Date: ', unique_datetime[i])

except Exception:
traceback.print_exc()
df_evaluation = get_model_evaluation_table(evaluation_record,trade_date)
return (df_predict_lr,
return (
df_predict_rf,
df_predict_ridge,
df_predict_gbm,
df_predict_xgb,
df_predict_best,
df_best_model_name,
evaluation_record,
@@ -374,20 +413,19 @@ def get_model_evaluation_table(evaluation_record,trade_date):
evaluation_list.append(evaluation_record[d]['model_eval'].values)
except KeyError:
print('no evaluation record for', d)
df_evaluation = pd.DataFrame(evaluation_list,columns = ['linear_regression', 'random_forest','ridge','gbm'])
df_evaluation = pd.DataFrame(evaluation_list, columns=['rf', 'xgb', 'gbm'])
df_evaluation.index = trade_date
return df_evaluation

def save_model_result(sector_result,sector_name):
df_predict_lr = sector_result[0].astype(np.float64)
df_predict_rf = sector_result[1].astype(np.float64)
df_predict_ridge = sector_result[2].astype(np.float64)
df_predict_gbm = sector_result[3].astype(np.float64)
df_predict_best = sector_result[4].astype(np.float64)

df_best_model_name = sector_result[5]
df_evaluation_score = sector_result[6]
df_model_score = sector_result[7]
df_predict_rf = sector_result[0].astype(np.float64)
df_predict_gbm = sector_result[1].astype(np.float64)
df_predict_xgb = sector_result[2].astype(np.float64)
df_predict_best = sector_result[3].astype(np.float64)

df_best_model_name = sector_result[4]
df_evaluation_score = sector_result[5]
df_model_score = sector_result[6]



@@ -400,10 +438,9 @@ def save_model_result(sector_result,sector_name):
raise


df_predict_lr.to_csv('results/'+sector_name+'/df_predict_lr.csv')
df_predict_rf.to_csv('results/'+sector_name+'/df_predict_rf.csv')
df_predict_ridge.to_csv('results/'+sector_name+'/df_predict_ridge.csv')
df_predict_gbm.to_csv('results/'+sector_name+'/df_predict_gbm.csv')
df_predict_xgb.to_csv('results/'+sector_name+'/df_predict_xgb.csv')
df_predict_best.to_csv('results/'+sector_name+'/df_predict_best.csv')
df_best_model_name.to_csv('results/'+sector_name+'/df_best_model_name.csv')
#df_evaluation_score.to_csv('results/'+sector_name+'/df_evaluation_score.csv')
