From 26fb3328527ef8537847f76390867f2a53b03db7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Franz=20Kir=C3=A1ly?=
Date: Sat, 9 Sep 2023 08:33:24 +0100
Subject: [PATCH] [ENH] residual double regressor - feature-complete
 implementation (#59)

This completes the implementation of `ResidualDouble`, relying in part on the
set of distributions now implemented - which are, in part, thanks to
@Alex-JG3!

The functionality is now on par with the `skpro` version 1 feature set,
upgraded to the new interface. A comprehensive docstring with a formal
summary of the algorithm has also been added.
---
 skpro/regression/residual.py | 220 +++++++++++++++++++++++++++++++----
 1 file changed, 200 insertions(+), 20 deletions(-)
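Reviewer note, not part of the commit: a minimal usage sketch of the completed
feature set. The estimator and distribution choices below are illustrative
only and interchangeable with any other valid configuration:

    import pandas as pd
    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import KFold, train_test_split

    from skpro.regression.residual import ResidualDouble

    X, y = load_diabetes(return_X_y=True, as_frame=True)
    y = pd.DataFrame(y)  # _fit reads y.columns, so pass a DataFrame label
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    reg_proba = ResidualDouble(
        estimator=LinearRegression(),              # predicts the mean
        estimator_resid=RandomForestRegressor(),   # predicts the scale
        distr_type="t",
        distr_params={"df": 3},
        use_y_pred=True,        # mean prediction as extra feature for scale
        cv=KFold(n_splits=3),   # out-of-sample residuals in fit
    )
    reg_proba.fit(X_train, y_train)
    y_pred_proba = reg_proba.predict_proba(X_test)  # skpro distribution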
diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py
index 72f3552ad..6d41594e6 100644
--- a/skpro/regression/residual.py
+++ b/skpro/regression/residual.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 """Residual regression - one regressor for mean, one for scale."""
 # copyright: skpro developers, BSD-3-Clause License (see LICENSE file)
 
@@ -8,41 +7,97 @@
 import pandas as pd
 from sklearn import clone
 
-from skpro.distributions.normal import Normal
 from skpro.regression.base import BaseProbaRegressor
 
 
 class ResidualDouble(BaseProbaRegressor):
     """Residual double regressor.
 
-    One regressor predicting the mean, and one the deviation from the mean.
+    Make a parametric probabilistic prediction using two tabular regressors:
+    one regressor predicting the mean, and one the deviation from the mean.
 
-    TODO - math description
+    The mean is predicted by ``estimator``. The residual is predicted by
+    ``estimator_resid``. The residual is transformed by ``residual_trafo``.
+    The predicted mean and residual are passed to a distribution specified by
+    ``distr_type``, and possibly ``distr_params``, ``distr_loc_scale_name``.
+
+    The residuals predicted on the training data are used to fit
+    ``estimator_resid``. If ``cv`` is passed, the residuals are out-of-sample
+    according to ``cv``, otherwise in-sample.
+
+    ``use_y_pred`` determines whether the predicted mean is used as a feature
+    in predicting the residual.
+
+    A formal description of the algorithm follows.
+
+    In ``fit``, given training data ``X``, ``y``:
+
+    1. Fit a clone ``estimator_`` of ``estimator`` to predict ``y`` from ``X``,
+       i.e., ``fit(X, y)``.
+    2. Predict the mean label ``y_pred`` for ``X`` using a clone of ``estimator``.
+       If ``cv`` is ``None``, this is via plain ``estimator.predict(X)``.
+       If ``cv`` is not ``None``, out-of-sample predictions are obtained via ``cv``.
+       In this case, indices not appearing in any test fold of ``cv`` retain
+       their actual label, i.e., have zero residual.
+    3. Compute the residual ``resid`` as ``residual_trafo(y - y_pred)``.
+       If ``residual_trafo`` is a transformer, ``residual_trafo.fit_transform`` is used.
+    4. Fit a clone ``estimator_resid_`` of ``estimator_resid``
+       to predict ``resid`` from ``X``, i.e., ``fit(X, resid)``.
+       If ``use_y_pred`` is ``True``, ``y_pred`` is used as a feature in predicting.
+
+    In ``predict``, given test data ``X``:
+
+    1. Predict the mean label ``y_pred`` for ``X`` using ``estimator_.predict(X)``.
+    2. Return ``y_pred``.
+
+    In ``predict_proba``, given test data ``X``:
+
+    1. Predict the mean label ``y_pred`` for ``X`` using ``estimator_.predict(X)``.
+    2. Predict the residual ``resid`` for ``X`` using ``estimator_resid_.predict(X)``.
+       If ``use_y_pred`` is ``True``, ``y_pred`` is used as a feature in predicting.
+    3. Predict the distribution ``y_pred_proba`` for ``X`` as follows:
+       the location parameter is ``y_pred``,
+       the scale parameter is ``resid``,
+       further parameters can be specified via ``distr_params``.
+    4. Return ``y_pred_proba``.
 
     Parameters
     ----------
-    estimator : skpro estimator, BaseProbaRegressor descendant
+    estimator : sklearn regressor
         estimator predicting the mean or location
-    estimator_resid : skpro estimator, BaseProbaRegressor descendant, optional
+    estimator_resid : sklearn regressor
         estimator predicting the scale of the residual
         default = sklearn DummyRegressor(strategy="mean")
-
-        TODO - add
-        estimator_resid : skpro estimator or dict of estimators with str keys
+    residual_trafo : str, or transformer, default="absolute"
         determines the labels predicted by ``estimator_resid``
         absolute = absolute residuals
         squared = squared residuals
+        if transformer, applies fit_transform to batch of signed residuals
     distr_type : str or BaseDistribution, default = "Normal"
         type of distribution to predict
        str options are "Normal", "Laplace", "Cauchy", "t"
+    distr_loc_scale_name : tuple of length two, default = ("loc", "scale")
+        names of the distribution parameters to use for location and scale
+        if ``distr_type`` is a string, this is overridden with the correct
+        parameter names
+        if ``distr_type`` is a BaseDistribution, this determines the location
+        and scale parameters that the predictions are passed to
+    distr_params : dict, default = {}
+        parameters to pass to the distribution
+        must be valid parameters of ``distr_type``, if ``BaseDistribution``
+        must be default or dict with key ``df``, if ``t`` distribution
     use_y_pred : bool, default=False
         whether to use the predicted location in predicting the scale of the residual
     cv : optional, sklearn cv splitter, default = None
         if passed, will be used to obtain out-of-sample residuals according to cv
         instead of in-sample residuals in ``fit`` of this estimator
     min_scale : float, default=1e-10
-        minimum scale parameter if ``estimator_resid`` is an estimator (not dict)
+        minimum scale parameter. If a smaller scale parameter is predicted by
+        ``estimator_resid``, it will be clipped to this value
+
+    Attributes
+    ----------
+    estimator_ : sklearn regressor, clone of ``estimator``
+        fitted estimator predicting the mean or location
+    estimator_resid_ : sklearn regressor, clone of ``estimator_resid``
+        fitted estimator predicting the scale of the residual
 
     Example
     -------
@@ -64,13 +119,29 @@ class ResidualDouble(BaseProbaRegressor):
 
     _tags = {"capability:missing": True}
 
-    def __init__(self, estimator, estimator_resid=None, min_scale=1e-10):
-
+    def __init__(
+        self,
+        estimator,
+        estimator_resid=None,
+        residual_trafo="absolute",
+        distr_type="Normal",
+        distr_loc_scale_name=None,
+        distr_params=None,
+        use_y_pred=False,
+        cv=None,
+        min_scale=1e-10,
+    ):
         self.estimator = estimator
         self.estimator_resid = estimator_resid
+        self.residual_trafo = residual_trafo
+        self.distr_type = distr_type
+        self.distr_loc_scale_name = distr_loc_scale_name
+        self.distr_params = distr_params
+        self.use_y_pred = use_y_pred
+        self.cv = cv
         self.min_scale = min_scale
 
-        super(ResidualDouble, self).__init__()
+        super().__init__()
 
         self.estimator_ = clone(estimator)
 
@@ -81,6 +152,36 @@ def __init__(self, estimator, estimator_resid=None, min_scale=1e-10):
         else:
             self.estimator_resid_ = clone(estimator_resid)
 
+    def _predict_residuals_cv(self, X, y, cv, est):
+        """Predict out-of-sample residuals for y from X using cv.
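+
+        For each train/test split of ``cv``, a clone of ``est`` is fitted on
+        the training fold and used to predict the test fold; fold-wise
+        predictions are collated into a single vector aligned with ``y``.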
+
+        Parameters
+        ----------
+        X : pandas DataFrame
+            feature instances to fit the regressor to
+        y : 1D numpy array, same length as ``X``
+            labels to fit the regressor to
+        cv : sklearn cv splitter
+            cv splitter to use for out-of-sample predictions
+        est : sklearn regressor
+            regressor to clone and fit on each training fold
+
+        Returns
+        -------
+        y_pred : 1D numpy array, same length as ``X``
+            out-of-sample predictions of ``y``; indices not appearing in any
+            test fold of ``cv`` retain their value from ``y``
+        """
+        method = "predict"
+        # start from a copy of y - indices never predicted keep their label
+        y_pred = y.copy()
+
+        for tr_idx, tt_idx in cv.split(X):
+            X_train = X.iloc[tr_idx]
+            X_test = X.iloc[tt_idx]
+            y_train = y[tr_idx]
+            fitted_est = clone(est).fit(X_train, y_train)
+            y_pred[tt_idx] = getattr(fitted_est, method)(X_test)
+
+        return y_pred
+
     def _fit(self, X, y):
         """Fit regressor to training data.
 
@@ -100,16 +201,36 @@ def _fit(self, X, y):
         """
         est = self.estimator_
         est_r = self.estimator_resid_
+        residual_trafo = self.residual_trafo
+        cv = self.cv
+        use_y_pred = self.use_y_pred
 
         self._y_cols = y.columns
         y = y.values.flatten()
 
         est.fit(X, y)
-        resids = np.abs(y - est.predict(X))
+
+        if cv is None:
+            y_pred = est.predict(X)
+        else:
+            y_pred = self._predict_residuals_cv(X, y, cv, est)
+
+        if residual_trafo == "absolute":
+            resids = np.abs(y - y_pred)
+        elif residual_trafo == "squared":
+            resids = (y - y_pred) ** 2
+        else:
+            resids = residual_trafo.fit_transform(y - y_pred)
         resids = resids.flatten()
 
-        est_r.fit(X, resids)
+        if use_y_pred:
+            y_ix = {"index": X.index, "columns": self._y_cols}
+            X_r = pd.concat([X, pd.DataFrame(y_pred, **y_ix)], axis=1)
+        else:
+            X_r = X
+
+        est_r.fit(X_r, resids)
 
         return self
 
@@ -160,19 +281,65 @@ def _predict_proba(self, X):
         """
         est = self.estimator_
         est_r = self.estimator_resid_
+        use_y_pred = self.use_y_pred
+        distr_type = self.distr_type
+        distr_loc_scale_name = self.distr_loc_scale_name
+        distr_params = self.distr_params
         min_scale = self.min_scale
 
+        if distr_params is None:
+            distr_params = {}
+        # documented default names, used if distr_type is a BaseDistribution
+        if distr_loc_scale_name is None:
+            distr_loc_scale_name = ("loc", "scale")
+
+        # predict location - this is the same as in _predict
         y_pred_loc = est.predict(X)
         y_pred_loc = y_pred_loc.reshape(-1, 1)
 
-        y_pred_scale = est_r.predict(X)
+        # predict scale
+        # if use_y_pred, use predicted location as feature
+        if use_y_pred:
+            y_ix = {"index": X.index, "columns": self._y_cols}
+            X_r = pd.concat([X, pd.DataFrame(y_pred_loc, **y_ix)], axis=1)
+        # if not use_y_pred, use only original features
+        else:
+            X_r = X
+
+        y_pred_scale = est_r.predict(X_r)
         y_pred_scale = y_pred_scale.clip(min=min_scale)
         y_pred_scale = y_pred_scale.reshape(-1, 1)
 
-        y_pred = Normal(
-            mu=y_pred_loc, sigma=y_pred_scale, index=X.index, columns=self._y_cols
-        )
+        # create distribution with predicted scale and location
+        # we deal with string distr_type by getting the class and param names
+        if distr_type == "Normal":
+            from skpro.distributions.normal import Normal
+
+            distr_type = Normal
+            distr_loc_scale_name = ("mu", "sigma")
+        elif distr_type == "Laplace":
+            from skpro.distributions.laplace import Laplace
+
+            distr_type = Laplace
+            distr_loc_scale_name = ("mu", "scale")
+        elif distr_type in ["Cauchy", "t"]:
+            from skpro.distributions.t import TDistribution
+
+            distr_type = TDistribution
+            distr_loc_scale_name = ("mu", "sigma")
+
+        # collate all parameters for the distribution constructor
+        # distribution params, if passed - copied, to avoid mutating the input
+        params = dict(distr_params)
+        # row/column index
+        ix = {"index": X.index, "columns": self._y_cols}
+        params.update(ix)
+        # location and scale
+        loc_scale = {
+            distr_loc_scale_name[0]: y_pred_loc,
+            distr_loc_scale_name[1]: y_pred_scale,
+        }
+        params.update(loc_scale)
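+
+        # e.g., with the default distr_type="Normal", params now resembles
+        # {"index": X.index, "columns": self._y_cols,
+        #  "mu": y_pred_loc, "sigma": y_pred_scale}
+        # with y_pred_loc and y_pred_scale of shape (n_samples, 1)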
+        # create distribution and return
+        y_pred = distr_type(**params)
 
         return y_pred
 
     @classmethod
@@ -195,12 +362,25 @@ def get_test_params(cls, parameter_set="default"):
         """
         from sklearn.ensemble import RandomForestRegressor
         from sklearn.linear_model import LinearRegression
+        from sklearn.model_selection import KFold
 
         params1 = {"estimator": RandomForestRegressor()}
         params2 = {
             "estimator": LinearRegression(),
             "estimator_resid": RandomForestRegressor(),
             "min_scale": 1e-7,
+            "residual_trafo": "squared",
+            "use_y_pred": True,
+            "distr_type": "Laplace",
+        }
+        params3 = {
+            "estimator": LinearRegression(),
+            "estimator_resid": RandomForestRegressor(),
+            "min_scale": 1e-6,
+            "use_y_pred": True,
+            "distr_type": "t",
+            "distr_params": {"df": 3},
+            "cv": KFold(n_splits=3),
         }
 
-        return [params1, params2]
+        return [params1, params2, params3]
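
A closing note, not part of the patch: on indices covered by the splitter, the
CV loop in `_predict_residuals_cv` computes the same kind of out-of-sample
predictions as sklearn's `cross_val_predict`. A rough standalone sketch of the
residual computation in `fit` when `cv` is passed (all names and data below
are illustrative only):

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import KFold, cross_val_predict

    X, y = make_regression(n_samples=100, n_features=3, random_state=0)

    # out-of-sample mean predictions, fold by fold
    y_pred = cross_val_predict(LinearRegression(), X, y, cv=KFold(n_splits=3))

    # residual labels fitted by estimator_resid, here residual_trafo="absolute"
    resids = np.abs(y - y_pred)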