[ENH] adds bootstrap probabilistic regressor (#39)
This PR adds a bootstrap probabilistic regressor that turns any sklearn
regressor into a probabilistic one via the classical bootstrap.

Also makes a minor upgrade to the `BaseProbaRegressor` class: if
`_predict` is not implemented, it defaults to
`_predict_proba(X).mean()`.
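
A minimal usage sketch (mirroring the docstring example in the new file; the 50-sample ensemble size and the train/test split are illustrative, not part of the commit). `predict_proba` returns an `Empirical` distribution over the bootstrap members, and `predict`, which `BootstrapRegressor` does not implement itself, falls back to the mean of that distribution via the new base class default:

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from skpro.regression.bootstrap import BootstrapRegressor

X, y = load_diabetes(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

# illustrative ensemble size; the default is 100
reg_proba = BootstrapRegressor(LinearRegression(), n_bootstrap_samples=50)
reg_proba.fit(X_train, y_train)

y_dist = reg_proba.predict_proba(X_test)  # Empirical distribution, one row per test instance
y_point = reg_proba.predict(X_test)       # mean of y_dist, via the new base class default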
fkiraly authored Aug 26, 2023
1 parent bc3cf0a commit ef89c13
Showing 2 changed files with 205 additions and 1 deletion.
10 changes: 9 additions & 1 deletion skpro/regression/base/_base.py
@@ -114,7 +114,15 @@ def _predict(self, X):
        y : pandas DataFrame, same length as `X`
            labels predicted for `X`
        """
        implements_proba = self._has_implementation_of("_predict_proba")

        if not implements_proba:
            raise NotImplementedError

        # default: point prediction is the mean of the predictive distribution
        pred_proba = self._predict_proba(X=X)
        pred_mean = pred_proba.mean()
        return pred_mean

    def predict_proba(self, X):
        """Predict distribution over labels for data from features.
196 changes: 196 additions & 0 deletions skpro/regression/bootstrap.py
@@ -0,0 +1,196 @@
# -*- coding: utf-8 -*-
"""Probabilistic regression by bootstrap."""

__author__ = ["fkiraly"]
__all__ = ["BootstrapRegressor"]

import numpy as np
import pandas as pd
from sklearn import clone

from skpro.distributions.empirical import Empirical
from skpro.regression.base import BaseProbaRegressor


class BootstrapRegressor(BaseProbaRegressor):
    """Bootstrap ensemble of a tabular regressor.

    Fits ``n_bootstrap_samples`` clones of a tabular ``sklearn`` regressor on
    datasets which are bootstrap sub-samples, i.e.,
    independent row samples with replacement.

    On ``predict_proba``, an empirical distribution over the predictions
    of the bootstrap ensemble is returned.

    Each bootstrap sample has the same number of rows as ``X``,
    drawn uniformly at random with replacement.

    Direct generalization of ``sklearn``'s ``BaggingRegressor``
    to the probabilistic regression task.

    Parameters
    ----------
    estimator : sklearn regressor
        regressor to use in the bootstrap ensemble
    n_bootstrap_samples : int, optional, default=100
        The number of bootstrap samples drawn,
        equal to the number of fitted clones of ``estimator``.
        Note: this is not the size of each bootstrap sample.
        Each bootstrap sample always has as many rows as ``X``.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, ``random_state`` is the seed used by the random number generator;
        If ``RandomState`` instance, ``random_state`` is the random number generator;
        If None, the random number generator is the ``RandomState`` instance used
        by ``np.random``.

    Attributes
    ----------
    estimators_ : list of sklearn regressors
        clones of ``estimator`` fitted on the bootstrap samples

    Examples
    --------
    >>> from skpro.regression.bootstrap import BootstrapRegressor
    >>> from sklearn.linear_model import LinearRegression
    >>> from sklearn.datasets import load_diabetes
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> X, y = load_diabetes(return_X_y=True, as_frame=True)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y)
    >>>
    >>> reg_tabular = LinearRegression()
    >>>
    >>> reg_proba = BootstrapRegressor(reg_tabular)
    >>> reg_proba.fit(X_train, y_train)
    BootstrapRegressor(...)
    >>> y_pred = reg_proba.predict_proba(X_test)
    """

    _tags = {"capability:missing": True}

    def __init__(
        self,
        estimator,
        n_bootstrap_samples=100,
        random_state=None,
    ):
        self.estimator = estimator
        self.n_bootstrap_samples = n_bootstrap_samples
        self.random_state = random_state

        super().__init__()

        # todo: find the equivalent tag in sklearn for missing data handling
        # tags_to_clone = ["capability:missing"]
        # self.clone_tags(estimator, tags_to_clone)

    def _fit(self, X, y):
        """Fit regressor to training data.

        Writes to self:
            Sets fitted model attributes ending in "_".

        Parameters
        ----------
        X : pandas DataFrame
            feature instances to fit regressor to
        y : pandas DataFrame, must be same length as X
            labels to fit regressor to

        Returns
        -------
        self : reference to self
        """
        estimator = self.estimator
        n_bootstrap_samples = self.n_bootstrap_samples
        np.random.seed(self.random_state)

        inst_ix = X.index
        n = len(inst_ix)

        self.estimators_ = []
        self._cols = y.columns

        for _i in range(n_bootstrap_samples):
            esti = clone(estimator)

            # draw a bootstrap sample of rows, with replacement, same size as X
            row_iloc = pd.RangeIndex(n)
            row_ss = _random_ss_ix(row_iloc, size=n, replace=True)
            inst_ix_i = inst_ix[row_ss]

            Xi = X.loc[inst_ix_i]
            Xi = Xi.reset_index(drop=True)

            yi = y.loc[inst_ix_i].reset_index(drop=True)

            self.estimators_ += [esti.fit(Xi, yi)]

        return self

    def _predict_proba(self, X):
        """Predict distribution over labels for data from features.

        State required:
            Requires state to be "fitted".

        Accesses in self:
            Fitted model attributes ending in "_"

        Parameters
        ----------
        X : pandas DataFrame, must have same columns as X in `fit`
            data to predict labels for

        Returns
        -------
        y : skpro BaseDistribution, same length as `X`
            labels predicted for `X`
        """
        cols = self._cols
        y_preds = [est.predict(X) for est in self.estimators_]

        def _coerce_df(x):
            if not isinstance(x, pd.DataFrame):
                x = pd.DataFrame(x, columns=cols, index=X.index)
            return x

        y_preds = [_coerce_df(x) for x in y_preds]

        # concatenate member predictions into a long frame with MultiIndex
        # (bootstrap sample number, test instance index)
        y_pred_df = pd.concat(y_preds, axis=0, keys=range(len(y_preds)))

        y_proba = Empirical(y_pred_df)
        return y_proba

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return `"default"` set.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class.
            Each dict contains parameters to construct an "interesting" test
            instance, i.e., `MyClass(**params)` or `MyClass(**params[i])` creates
            a valid test instance.
            `create_test_instance` uses the first (or only) dictionary in `params`.
        """
        from sklearn.linear_model import LinearRegression

        params1 = {"estimator": LinearRegression()}
        params2 = {
            "estimator": LinearRegression(),
            "n_bootstrap_samples": 10,
        }

        return [params1, params2]


def _random_ss_ix(ix, size, replace=True):
    """Randomly uniformly sample indices from a list of indices."""
    a = range(len(ix))
    ixs = ix[np.random.choice(a, size=size, replace=replace)]
    return ixs
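
For orientation, a small sketch of the long-format frame that `_predict_proba` passes to `Empirical` (toy numbers and a hypothetical column name "y", not part of the commit): the outer index level is the bootstrap sample number and the inner level is the test instance index, exactly the shape produced by the `pd.concat(..., keys=...)` call above.

import pandas as pd

# two hypothetical bootstrap members, each predicting for three test rows
pred_0 = pd.DataFrame({"y": [1.0, 2.0, 3.0]}, index=[10, 11, 12])
pred_1 = pd.DataFrame({"y": [1.5, 2.5, 2.5]}, index=[10, 11, 12])

# outer level = bootstrap sample number, inner level = test instance index
spl = pd.concat([pred_0, pred_1], axis=0, keys=range(2))

# per-instance average over the bootstrap members; this is the point
# prediction one would expect Empirical(spl).mean() to yield
print(spl.groupby(level=-1).mean())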
