diff --git a/src/calidhayte/calibrate.py b/src/calidhayte/calibrate.py index 0ea911c..f377c21 100644 --- a/src/calidhayte/calibrate.py +++ b/src/calidhayte/calibrate.py @@ -13,8 +13,10 @@ from collections.abc import Iterable from copy import deepcopy as dc import logging +from pathlib import Path +import pickle import sys -from typing import Any, List, Literal, Union +from typing import Any, List, Literal, Optional, Union import warnings # import bambi as bmb @@ -33,19 +35,18 @@ from sklearn.gaussian_process import kernels as kern import sklearn.preprocessing as pre from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV +from sklearn.model_selection import train_test_split from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline import xgboost as xgb -_logger = logging.getLogger("pymc") -_logger.setLevel(logging.ERROR) - def cont_strat_folds( df: pd.DataFrame, target_var: str, splits: int = 5, strat_groups: int = 5, + validation_size: float = 0.1, seed: int = 62 ) -> pd.DataFrame: """ @@ -59,6 +60,8 @@ def cont_strat_folds( Number of folds to make. strat_groups : int, default=10 Number of groups to split data in to for stratification. + validation_size : float, default = 0.1 + Size of measurements to keep aside for validation seed : int, default=62 Random state to use. @@ -97,22 +100,35 @@ def cont_strat_folds( """ _df = df.copy() - _df['Fold'] = -1 + _df['Fold'] = 'Validation' skf = StratifiedKFold( n_splits=splits, random_state=seed, shuffle=True ) - _df['Group'] = pd.cut( + _df['Group'] = pd.qcut( _df.loc[:, target_var], strat_groups, labels=False ) + group_label = _df.loc[:, 'Group'] + train_set, val_set = train_test_split( + _df, + test_size=validation_size, + random_state=seed, + shuffle=True, + stratify=group_label + ) + + group_label = train_set.loc[:, 'Group'] + for fold_number, (_, v) in enumerate(skf.split(group_label, group_label)): - _df.loc[v, 'Fold'] = fold_number - return _df.drop('Group', axis=1) + _temp_df = train_set.iloc[v, :] + _temp_df.loc[:, 'Fold'] = fold_number + train_set.iloc[v, :] = _temp_df + return pd.concat([train_set, val_set]).sort_index().drop('Group', axis=1) class Calibrate: @@ -227,6 +243,11 @@ def __init__( 'Quantile Transform (Gaussian)', ] ] = 'None', + random_search_iterations: int = 25, + validation_size: float = 0.1, + verbosity: int = 0, + n_jobs: int = -1, + pickle_path: Optional[Path] = None, seed: int = 62 ): """Initialises class @@ -326,13 +347,31 @@ def __init__( """ The scaling algorithm(s) to preprocess the data with """ + self.y_data = cont_strat_folds( + y_data.loc[join_index, :], + target, + folds, + strat_groups, + validation_size, + seed + ) + """ + The data that `x_data` will be calibrated against. A '*Fold*' + column is added using the `const_strat_folds` function which splits + the data into k stratified folds (where k is the value of + `folds`). It splits the continuous measurements into n bins (where n + is the value of `strat_groups`) and distributes each bin equally + across all folds. This significantly reduces the chances of one fold + containing a skewed distribution relative to the whole dataset. 
+ """ if isinstance(scaler, str): if scaler == "All": - if not bool(self.x_data.ge(0).all(axis=None)): + if bool(self.x_data.le(0).any(axis=None)) or bool(self.y_data.drop('Fold', axis=1).le(0).any(axis=None)): + self.scaler_list.pop('Box-Cox Transform') warnings.warn( + 'WARN: ' 'Box-Cox is not compatible with provided measurements' ) - self.scaler_list.pop('Box-Cox Transform') self.scaler.extend(self.scaler_list.keys()) elif scaler in self.scaler_list.keys(): self.scaler.append(scaler) @@ -341,8 +380,9 @@ def __init__( warnings.warn(f'Scaling algorithm {scaler} not recognised') elif isinstance(scaler, (tuple, list)): for sc in scaler: - if sc == 'Box-Cox Transform' and not bool( - self.x_data.ge(0).all(axis=None) + if sc == 'Box-Cox Transform' and not any( + bool(self.x_data.lt(0).any(axis=None)), + bool(self.y_data.lt(0).any(axis=None)) ): warnings.warn( 'Box-Cox is not compatible with provided measurements' @@ -362,22 +402,6 @@ def __init__( ) self.scaler.append('None') - self.y_data = cont_strat_folds( - y_data.loc[join_index, :], - target, - folds, - strat_groups, - seed - ) - """ - The data that `x_data` will be calibrated against. A '*Fold*' - column is added using the `const_strat_folds` function which splits - the data into k stratified folds (where k is the value of - `folds`). It splits the continuous measurements into n bins (where n - is the value of `strat_groups`) and distributes each bin equally - across all folds. This significantly reduces the chances of one fold - containing a skewed distribution relative to the whole dataset. - """ self.models: dict[str, # Technique name dict[str, # Scaling technique dict[str, # Variable combo @@ -423,6 +447,19 @@ def __init__( """ The number of folds used in k-fold cross validation """ + self.rs_iter: int = random_search_iterations + """ + Number of iterations to use in random search + """ + self.verbosity: int = verbosity + """ + Verbosity of output when using random search + """ + self.n_jobs: int = n_jobs + """ + Number of processor cores to use + """ + self.pkl = pickle_path def _sklearn_regression_meta( self, @@ -460,8 +497,11 @@ def _sklearn_regression_meta( """ x_secondary_cols = self.x_data.drop(self.target, axis=1).columns # All columns in x_data that aren't the target variable - products = [[np.nan, col] for col in x_secondary_cols] - secondary_vals = pd.MultiIndex.from_product(products) + if len(x_secondary_cols) > 0: + products = [[np.nan, col] for col in x_secondary_cols] + secondary_vals = pd.MultiIndex.from_product(products) + else: + secondary_vals = [None] # Get all possible combinations of secondary variables in a pandas # MultiIndex if self.models.get(name) is None: @@ -476,7 +516,10 @@ def _sklearn_regression_meta( # technique yet, add its key to the nested dictionary for sec_vals in secondary_vals: # Loop over all combinations of secondary values - vals = [self.target] + [v for v in sec_vals if v == v] + if sec_vals is not None: + vals = [self.target] + [v for v in sec_vals if v == v] + else: + vals = [self.target] vals_str = ' + '.join(vals) if len(vals) < min_coeffs or len(vals) > max_coeffs: # Skip if number of coeffs doesn't lie within acceptable @@ -485,29 +528,30 @@ def _sklearn_regression_meta( # only works with one variable continue self.models[name][scaler][vals_str] = dict() - if random_search: - pipeline = Pipeline([ - ("Selector", ColumnTransformer([ - ("selector", "passthrough", vals) - ], remainder="drop") - ), - ("Scaler", self.scaler_list[scaler]), - ("Regression", reg) - ]) - pipeline.fit( - 
self.x_data, - self.y_data.loc[:, self.target] - ) - self.models[name][scaler][vals_str][0] = dc(pipeline) - continue - +# if random_search: +# pipeline = Pipeline([ +# ("Selector", ColumnTransformer([ +# ("selector", "passthrough", vals) +# ], remainder="drop") +# ), +# ("Scaler", self.scaler_list[scaler]), +# ("Regression", reg) +# ]) +# pipeline.fit( +# self.x_data, +# self.y_data.loc[:, self.target] +# ) +# self.models[name][scaler][vals_str][0] = dc(pipeline) +# continue +# for fold in self.y_data.loc[:, 'Fold'].unique(): + if fold == 'Validation': + continue y_data = self.y_data[ self.y_data.loc[:, 'Fold'] != fold ] if reg in ['t', 'gaussian']: # If using PyMC bayesian model, - # format data and build model using bambi # then store result in pipeline # Currently doesn't work as PyMC models # can't be pickled, so don't function with deepcopy. @@ -551,7 +595,15 @@ def _sklearn_regression_meta( self.x_data.loc[y_data.index, :], y_data.loc[:, self.target] ) - self.models[name][scaler][vals_str][fold] = dc(pipeline) + if isinstance(self.pkl, Path): + pkl_path = self.pkl / name / scaler / vals_str + pkl_path.mkdir(parents=True, exist_ok=True) + pkl_file = pkl_path / f'{fold}.pkl' + with pkl_file.open('wb') as pkl: + pickle.dump(pipeline, pkl) + self.models[name][scaler][vals_str][fold] = pkl_file + else: + self.models[name][scaler][vals_str][fold] = dc(pipeline) def pymc_bayesian( self, @@ -602,7 +654,7 @@ def linreg( ] = { }, **kwargs - ): + ): """ Fit x on y via linear regression @@ -625,6 +677,9 @@ def linreg( classifier = RandomizedSearchCV( lm.LinearRegression(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -682,6 +737,9 @@ def ridge( classifier = RandomizedSearchCV( lm.Ridge(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -756,6 +814,9 @@ def lasso( classifier = RandomizedSearchCV( lm.Lasso(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -791,80 +852,6 @@ def lasso_cv( random_search=True ) - def multi_task_lasso( - self, - name: str = "Multi-task Lasso Regression", - random_search: bool = False, - parameters: dict[ - str, - Union[ - scipy.stats.rv_continuous, - List[Union[int, str, float]] - ] - ] = { - 'alpha': uniform(loc=0, scale=2), - 'tol': uniform(loc=0, scale=1), - 'selection': ['cyclic', 'random'] - }, - **kwargs - ): - """ - Fit x on y via multitask lasso regression - - Parameters - ---------- - name : str, default="Multi-task Lasso Regression" - Name of classification technique. - random_search : bool, default=False - Whether to perform RandomizedSearch to optimise parameters - parameters : dict[ - str, - Union[ - scipy.stats.rv_continuous, - List[Union[int, str, float]] - ] - ], default=Preset distributions - The parameters used in RandomizedSearchCV - """ - if random_search: - classifier = RandomizedSearchCV( - lm.MultiTaskLasso(**kwargs), - parameters, - cv=self.folds - ) - else: - classifier = lm.MultiTaskLasso(**kwargs) - self._sklearn_regression_meta( - classifier, - f'{name}{" (Random Search)" if random_search else ""}', - random_search=random_search - ) - - def multi_task_lasso_cv( - self, - name: str = "Multi-task Lasso Regression (Cross Validated)", - random_search: bool = False, - **kwargs - ): - """ - Fit x on y via cross-validated multitask lasso regression. 
- Already cross validated so random search not required - - Parameters - ---------- - name : str, default="Multi-task Lasso Regression (Cross Validated)" - Name of classification technique - random_search : bool, default=False - Not used - - """ - _ = random_search - self._sklearn_regression_meta( - lm.MultiTaskLassoCV(**kwargs, cv=self.folds), - name, - random_search=True - ) - def elastic_net( self, name: str = "Elastic Net Regression", @@ -905,6 +892,9 @@ def elastic_net( classifier = RandomizedSearchCV( lm.ElasticNet(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -939,82 +929,6 @@ def elastic_net_cv( random_search=True ) - def multi_task_elastic_net( - self, - name: str = "Multi-task Elastic Net Regression", - random_search: bool = False, - parameters: dict[ - str, - Union[ - scipy.stats.rv_continuous, - List[Union[int, str, float]] - ] - ] = { - 'alpha': uniform(loc=0, scale=2), - 'l1_ratio': uniform(loc=0, scale=1), - 'tol': uniform(loc=0, scale=1), - 'selection': ['cyclic', 'random'] - }, - **kwargs - ): - """ - Fit x on y via elastic net regression - - Parameters - ---------- - name : str, default="Multi-task Elastic Net Regression" - Name of classification technique. - random_search : bool, default=False - Whether to perform RandomizedSearch to optimise parameters - parameters : dict[ - str, - Union[ - scipy.stats.rv_continuous, - List[Union[int, str, float]] - ] - ], default=Preset distributions - The parameters used in RandomizedSearchCV - """ - if random_search: - classifier = RandomizedSearchCV( - lm.MultiTaskElasticNet(**kwargs), - parameters, - cv=self.folds - ) - else: - classifier = lm.MultiTaskElasticNet(**kwargs) - self._sklearn_regression_meta( - classifier, - f'{name}{" (Random Search)" if random_search else ""}', - random_search=random_search - ) - - def multi_task_elastic_net_cv( - self, - name: str = "Multi-Task Elastic Net Regression (Cross Validated)", - random_search: bool = False, - **kwargs - ): - """ - Fit x on y via cross-validated multi-task elastic net regression. 
- Already cross validated so random search not required - - Parameters - ---------- - name : str, default="Multi-Task Elastic Net Regression \ - (Cross Validated)" - Name of classification technique - random_search : bool, default=False - Not used - - """ - _ = random_search - self._sklearn_regression_meta( - lm.MultiTaskElasticNetCV(**kwargs, cv=self.folds), - name, - random_search=True - ) - def lars( self, name: str = "Least Angle Regression", @@ -1052,6 +966,9 @@ def lars( classifier = RandomizedSearchCV( lm.Lars(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -1099,6 +1016,9 @@ def lars_lasso( classifier = RandomizedSearchCV( lm.LassoLars(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -1146,6 +1066,9 @@ def omp( classifier = RandomizedSearchCV( lm.OrthogonalMatchingPursuit(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -1198,6 +1121,9 @@ def bayesian_ridge( classifier = RandomizedSearchCV( lm.BayesianRidge(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -1249,6 +1175,9 @@ def bayesian_ard( classifier = RandomizedSearchCV( lm.ARDRegression(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -1286,12 +1215,12 @@ def tweedie( Name of classification technique. random_search : bool, default=False Whether to perform RandomizedSearch to optimise parameters - parameters : dict[ - str, - Union[ - scipy.stats.rv_continuous, - List[Union[int, str, float]] - ] + parameters : dict[\ + str,\ + Union[\ + scipy.stats.rv_continuous,\ + List[Union[int, str, float]]\ + ]\ ], default=Preset distributions The parameters used in RandomizedSearchCV """ @@ -1299,6 +1228,9 @@ def tweedie( classifier = RandomizedSearchCV( lm.TweedieRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -1370,6 +1302,9 @@ def stochastic_gradient_descent( classifier = RandomizedSearchCV( lm.SGDRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -1402,7 +1337,7 @@ def passive_aggressive( **kwargs ): """ - Fit x on y via stochastic gradient descent regression + Fit x on y via passive aggressive regression Parameters ---------- @@ -1423,6 +1358,9 @@ def passive_aggressive( classifier = RandomizedSearchCV( lm.PassiveAggressiveRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -1448,7 +1386,8 @@ def ransac( lm.LinearRegression(), lm.TheilSenRegressor(), lm.LassoLarsCV() - ] + ], + 'min_samples': [1E-4, 1E-3, 1E-2] }, **kwargs ): @@ -1474,6 +1413,9 @@ def ransac( classifier = RandomizedSearchCV( lm.RANSACRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -1521,6 +1463,9 @@ def theil_sen( classifier = RandomizedSearchCV( lm.TheilSenRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -1570,6 +1515,9 @@ def huber( classifier = RandomizedSearchCV( lm.HuberRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -1593,7 +1541,6 @@ def quantile( ] 
= { 'quantile': uniform(loc=0, scale=2), 'alpha': uniform(loc=0, scale=2), - 'tol': uniform(loc=0, scale=1), 'solver': [ 'highs-ds', 'highs-ipm', @@ -1626,10 +1573,13 @@ def quantile( classifier = RandomizedSearchCV( lm.QuantileRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds - ) + ) else: - classifier = lm.QuantileRegressor(**kwargs) + classifier = lm.QuantileRegressor(solver='highs', **kwargs) self._sklearn_regression_meta( classifier, f'{name}{" (Random Search)" if random_search else ""}', @@ -1688,6 +1638,9 @@ def decision_tree( classifier = RandomizedSearchCV( tree.DecisionTreeRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -1750,6 +1703,9 @@ def extra_tree( classifier = RandomizedSearchCV( tree.ExtraTreeRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -1771,8 +1727,7 @@ def random_forest( List[Union[int, str, float]] ] ] = { - 'n_estimators': [5, 10, 25, 50, 100, 200, 250, 500], - 'bootstrap': [True, False], + 'n_estimators': [5, 25, 100, 250], 'max_samples': uniform(loc=0.01, scale=0.99), 'criterion': [ 'squared_error', @@ -1809,12 +1764,15 @@ def random_forest( """ if random_search: classifier = RandomizedSearchCV( - en.RandomForestRegressor(**kwargs), + en.RandomForestRegressor(bootstrap=True, **kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: - classifier = en.RandomForestRegressor(**kwargs) + classifier = en.RandomForestRegressor(bootstrap=True, **kwargs) self._sklearn_regression_meta( classifier, f'{name}{" (Random Search)" if random_search else ""}', @@ -1832,8 +1790,7 @@ def extra_trees_ensemble( List[Union[int, str, float]] ] ] = { - 'n_estimators': [5, 10, 25, 50, 100, 200, 250, 500], - 'bootstrap': [True, False], + 'n_estimators': [5, 25, 100, 250], 'max_samples': uniform(loc=0.01, scale=0.99), 'criterion': [ 'squared_error', @@ -1870,12 +1827,15 @@ def extra_trees_ensemble( """ if random_search: classifier = RandomizedSearchCV( - en.ExtraTreesRegressor(**kwargs), + en.ExtraTreesRegressor(bootstrap=True, **kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: - classifier = en.ExtraTreesRegressor(**kwargs) + classifier = en.ExtraTreesRegressor(bootstrap=True, **kwargs) self._sklearn_regression_meta( classifier, f'{name}{" (Random Search)" if random_search else ""}', @@ -1900,7 +1860,7 @@ def gradient_boost_regressor( 'quantile' ], 'learning_rate': uniform(loc=0, scale=2), - 'n_estimators': [5, 10, 25, 50, 100, 200, 250, 500], + 'n_estimators': [5, 25, 100, 250], 'subsample': uniform(loc=0.01, scale=0.99), 'criterion': [ 'friedman_mse', @@ -1943,6 +1903,9 @@ def gradient_boost_regressor( classifier = RandomizedSearchCV( en.GradientBoostingRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -2001,6 +1964,9 @@ def hist_gradient_boost_regressor( classifier = RandomizedSearchCV( en.HistGradientBoostingRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -2041,15 +2007,6 @@ def mlp_regressor( 'adam' ], 'alpha': uniform(loc=0, scale=0.1), - 'batch_size': [ - 'auto', - 20, - 200, - 500, - 1000, - 5000, - 10000 - ], 'learning_rate': [ 'constant', 'invscaling', @@ -2062,7 +2019,7 @@ 
def mlp_regressor( 'momentum': uniform(loc=0.1, scale=0.9), 'beta_1': uniform(loc=0.1, scale=0.9), 'beta_2': uniform(loc=0.1, scale=0.9), - 'epsilon': uniform(loc=1E8, scale=1E6), + 'epsilon': uniform(loc=1E-8, scale=1E-6), }, **kwargs @@ -2089,6 +2046,9 @@ def mlp_regressor( classifier = RandomizedSearchCV( nn.MLPRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -2120,7 +2080,7 @@ def svr( 'gamma': ['scale', 'auto'], 'coef0': uniform(loc=0, scale=1), 'C': uniform(loc=0.1, scale=1.9), - 'epsilon': uniform(loc=1E8, scale=1), + 'epsilon': uniform(loc=1E-8, scale=1), 'shrinking': [True, False] }, **kwargs @@ -2147,6 +2107,9 @@ def svr( classifier = RandomizedSearchCV( svm.SVR(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -2169,7 +2132,7 @@ def linear_svr( ] ] = { 'C': uniform(loc=0.1, scale=1.9), - 'epsilon': uniform(loc=1E8, scale=1), + 'epsilon': uniform(loc=1E-8, scale=1), 'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'] }, **kwargs @@ -2196,6 +2159,9 @@ def linear_svr( classifier = RandomizedSearchCV( svm.LinearSVR(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -2253,6 +2219,9 @@ def nu_svr( classifier = RandomizedSearchCV( svm.NuSVR(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -2283,7 +2252,7 @@ def gaussian_process( kern.CompoundKernel, kern.ExpSineSquared ], - 'alpha': uniform(loc=0, scale=1E8), + 'alpha': uniform(loc=0, scale=1E-8), 'normalize_y': [True, False] }, **kwargs @@ -2310,6 +2279,9 @@ def gaussian_process( classifier = RandomizedSearchCV( gp.GaussianProcessRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -2320,6 +2292,7 @@ def gaussian_process( random_search=random_search ) + def isotonic( self, name: str = "Isotonic Regression", @@ -2357,6 +2330,9 @@ def isotonic( classifier = RandomizedSearchCV( iso.IsotonicRegression(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -2379,7 +2355,7 @@ def xgboost( List[Union[int, str, float]] ] ] = { - 'n_estimators': [5, 10, 25, 50, 100, 200, 250, 500], + 'n_estimators': [5, 25, 100, 250], 'max_bins': [1, 3, 7, 15, 31, 63, 127, 255], 'grow_policy': [ 'depthwise', @@ -2416,6 +2392,9 @@ def xgboost( classifier = RandomizedSearchCV( xgb.XGBRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -2437,7 +2416,7 @@ def xgboost_rf( List[Union[int, str, float]] ] ] = { - 'n_estimators': [5, 10, 25, 50, 100, 200, 250, 500], + 'n_estimators': [5, 25, 100, 250], 'max_bin': [1, 3, 7, 15, 31, 63, 127, 255], 'grow_policy': [ 'depthwise', @@ -2474,6 +2453,9 @@ def xgboost_rf( classifier = RandomizedSearchCV( xgb.XGBRFRegressor(**kwargs), parameters, + n_iter=self.rs_iter, + verbose=self.verbosity, + n_jobs=self.n_jobs, cv=self.folds ) else: @@ -2528,3 +2510,9 @@ def return_models(self) -> dict[str, # Technique trained on """ return self.models + + def clear_models(self): + """ + """ + del self.models + self.models = dict() diff --git a/src/calidhayte/results.py b/src/calidhayte/results.py index cd8b2af..2212e51 100644 --- a/src/calidhayte/results.py +++ b/src/calidhayte/results.py @@ -7,11 +7,12 @@ [^skl]: 
https://scikit-learn.org/stable/modules/classes.html """ -try: - from typing import Any, TypeAlias -except ImportError: - from typing_extensions import Any, TypeAlias +import logging +from pathlib import Path +import pickle +from typing import Any, TypeAlias, Union +import numpy as np import pandas as pd from sklearn import metrics as met from sklearn.pipeline import Pipeline @@ -20,11 +21,13 @@ dict[str, # Scaling technique dict[str, # Variable combo dict[int, # Fold - Pipeline]]]] + Union[ + Path, Pipeline]]]]] """ Type alias for the nested dictionaries that the models are stored in """ +logger = logging.getLogger(f'__main__.{__name__}') class Results: """ @@ -39,7 +42,8 @@ def __init__( x_data: pd.DataFrame, y_data: pd.DataFrame, target: str, - models: CoefficientPipelineDict + models: CoefficientPipelineDict, + errors: pd.DataFrame = pd.DataFrame() ): """ Initialises the class @@ -55,6 +59,9 @@ def __init__( the name of a column in both `x_data` and `y_data`. models : CoefficientPipelineDict The calibrated models. + errors : pd.DataFrame + Any previously calculated errors. Useful if you need to skip over + previous calculations. """ if target not in x_data.columns or target not in y_data.columns: not_in_x = target not in x_data.columns @@ -107,7 +114,7 @@ def __init__( ``` """ - self.errors: pd.DataFrame = pd.DataFrame() + self.errors: pd.DataFrame = errors """ Results of error metric valculations. Index increases sequentially by 1, columns contain the technique, scaling method, variables and @@ -120,23 +127,57 @@ def __init__( |1|Theil-Sen|Yeo-JohnsonScaling|x + a + b|1|0.98|...|0.01| |...|...|...|...|...|...|...|...| |55|Extra Trees|None|x|2|0.43|...|0.52| - """ + self.pred_vals: dict[str, dict[ str, dict[str, pd.DataFrame]]] = dict() + """ + """ + self.cached_error_length: int = self.errors.shape[0] def _sklearn_error_meta(self, err: Any, name: str, **kwargs): """ """ - idx = 0 + idx = self.cached_error_length + true = self.y.loc[ + :, self.target + ][self.y.loc[:, 'Fold'] == 'Validation'] for technique, scaling_techniques in self.models.items(): + if self.pred_vals.get(technique) is None: + self.pred_vals[technique] = dict() for scaling_technique, var_combos in scaling_techniques.items(): + if self.pred_vals[technique].get(scaling_technique) is None: + self.pred_vals[technique][scaling_technique] = dict() for vars, folds in var_combos.items(): + self.pred_vals[technique][scaling_technique][vars] = pd.DataFrame(index=true.index) + try: + if self.errors.loc[ + (self.errors['Technique'] == technique) & + (self.errors['Scaling Method'] == scaling_technique) & + (self.errors['Variables'] == vars) + ].loc[:, name].notna().any(axis=None): + continue + except KeyError: + pass for fold, pipe in folds.items(): - true = self.y.loc[ - :, self.target - ][self.y.loc[:, 'Fold'] == fold] - pred_raw = self.x.loc[true.index, :] - pred = pipe.predict(pred_raw) - error = err(true, pred, **kwargs) + if fold not in self.pred_vals[technique][scaling_technique][vars].columns: + pred_raw = self.x.loc[true.index, vars.split(' + ')] + if isinstance(pipe, Pipeline): + pipe_to_use = pipe + elif isinstance(pipe, Path): + with pipe.open('rb') as pkl: + pipe_to_use = pickle.load(pkl) + else: + continue + pred_no_ind = pipe_to_use.predict(pred_raw) + self.pred_vals[technique][scaling_technique][vars][fold] = pred_no_ind + try: + predicted = self.pred_vals[technique][scaling_technique][vars][fold].dropna() + error = err(true[predicted.index], predicted, **kwargs) + except ValueError as exc: + 
logger.warning('%s, %s, %s, %s', technique, scaling_technique, vars, fold)
+                            for arg in exc.args:
+                                logger.warning(arg)
+                            error = np.nan
+
                     if idx not in self.errors.index:
                         self.errors.loc[idx, 'Technique'] = technique
                         self.errors.loc[
@@ -146,6 +187,17 @@ def _sklearn_error_meta(self, err: Any, name: str, **kwargs):
                     self.errors.loc[idx, 'Fold'] = fold
                     self.errors.loc[idx, name] = error
                     idx = idx+1
+                if idx not in self.errors.index:
+                    self.errors.loc[idx, 'Technique'] = technique
+                    self.errors.loc[
+                        idx, 'Scaling Method'
+                    ] = scaling_technique
+                    self.errors.loc[idx, 'Variables'] = vars
+                    self.errors.loc[idx, 'Fold'] = 'All'
+                    predicted = self.pred_vals[technique][scaling_technique][vars].mean(axis=1).dropna()
+                    error = err(self.y.loc[predicted.index, self.target], predicted, **kwargs)
+                    self.errors.loc[idx, name] = error
+                    idx = idx+1

     def explained_variance_score(self):
         """Calculate the explained variance score between the true values (y)
@@ -350,6 +402,4 @@ def return_errors(self) -> pd.DataFrame:
         |Extra Trees|None|x|2|0.43|...|0.52|

         """
-        return self.errors.set_index(
-            ['Technique', 'Scaling Method', 'Variables', 'Fold']
-        )
+        return self.errors
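
The sketches that follow illustrate the behaviour introduced by this patch; all data, file
names and column names in them are invented for illustration, and the import paths assume
the package layout shown above (src/calidhayte).

A minimal sketch of the reworked cont_strat_folds, showing the new 'Validation' hold-out
alongside the stratified folds:

import numpy as np
import pandas as pd

from calidhayte.calibrate import cont_strat_folds

# 500 synthetic measurements of a single continuous variable 'y'
rng = np.random.default_rng(62)
df = pd.DataFrame({'y': rng.normal(loc=10, scale=2, size=500)})

folded = cont_strat_folds(
    df,
    target_var='y',
    splits=5,
    strat_groups=5,
    validation_size=0.1,
    seed=62
)

# Roughly 10 % of rows keep the 'Validation' label; the rest receive fold ids 0-4,
# stratified over quantile bins of 'y' so each fold mirrors the overall distribution
print(folded['Fold'].value_counts())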
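
The new random_search_iterations, verbosity and n_jobs arguments are threaded through to
every RandomizedSearchCV call as n_iter, verbose and n_jobs. A sketch of how they are set
(the CSV file names, the 'NO2' target and the positional x/y argument order are assumptions,
not taken from this diff):

import pandas as pd

from calidhayte.calibrate import Calibrate

x = pd.read_csv('low_cost_sensor.csv', index_col=0)       # hypothetical device data
y = pd.read_csv('reference_instrument.csv', index_col=0)  # hypothetical reference data

cal = Calibrate(
    x,
    y,
    target='NO2',
    random_search_iterations=50,  # becomes n_iter in RandomizedSearchCV
    verbosity=2,                  # becomes verbose
    n_jobs=4,                     # cores used by the search
    validation_size=0.1,          # fraction held back as the 'Validation' fold
)
cal.elastic_net(random_search=True)  # stored as "Elastic Net Regression (Random Search)"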
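
When pickle_path is supplied, each fitted pipeline is written to
<pickle_path>/<technique>/<scaler>/<variable combination>/<fold>.pkl and the models
dictionary holds Path objects instead of in-memory Pipelines; Results then unpickles them
on demand inside _sklearn_error_meta. A sketch of that round trip, under the same
assumptions as above (and glossing over how the 'Fold' column reaches the y frame passed
to Results):

from pathlib import Path

import pandas as pd

from calidhayte.calibrate import Calibrate
from calidhayte.results import Results

x = pd.read_csv('low_cost_sensor.csv', index_col=0)
y = pd.read_csv('reference_instrument.csv', index_col=0)

cal = Calibrate(
    x,
    y,
    target='NO2',
    pickle_path=Path('cache/pipelines'),  # pipelines are pickled here rather than kept in memory
)
cal.linreg()
cal.random_forest(random_search=True)

models = cal.return_models()  # leaf values are now Path objects pointing at .pkl files

# NOTE: Results expects a y frame carrying the 'Fold' column added by Calibrate
res = Results(x, y, target='NO2', models=models)
res.explained_variance_score()  # Path entries are unpickled before predict() is called
errors = res.return_errors()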
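
Two smaller changes in results.py are also worth spelling out: previously computed errors
can be passed back in through the new errors argument, so combinations whose metric is
already present are skipped, and every technique/scaler/variable combination now receives an
additional 'Fold' == 'All' row scored on the fold-averaged predictions over the validation
hold-out. Continuing from the previous sketch (the cache file name is hypothetical):

# hypothetical cache written out by an earlier, interrupted run
cached_errors = pd.read_csv('errors_so_far.csv', index_col=0)

res = Results(
    x,
    y,
    target='NO2',
    models=models,
    errors=cached_errors,  # rows that already hold this metric are not recalculated
)
res.explained_variance_score()
errors = res.return_errors()

# one row per fold plus a 'Fold' == 'All' row per combination, scored on the hold-out set
print(errors[errors['Fold'] == 'All'].head())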