diff --git a/.all-contributorsrc b/.all-contributorsrc
index dbe365847..af77c211e 100644
--- a/.all-contributorsrc
+++ b/.all-contributorsrc
@@ -176,6 +176,17 @@
         "maintenance"
       ]
     },
+    {
+      "login": "meraldoantonio",
+      "name": "Meraldo Antonio",
+      "avatar_url": "https://avatars.githubusercontent.com/u/37468543?v=4",
+      "profile": "https://github.com/meraldoantonio",
+      "contributions": [
+        "bug",
+        "code",
+        "doc"
+      ]
+    },
     {
       "login": "szepeviktor",
       "name": "Viktor Szépe",
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index ddcf7616d..2f4dbf106 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -103,7 +103,7 @@ jobs:
         run: make test

       - name: Publish code coverage
-        uses: codecov/codecov-action@v4
+        uses: codecov/codecov-action@v5

   run-tests-all-extras:
     needs: code-quality
@@ -142,4 +142,4 @@ jobs:
         run: make test

       - name: Publish code coverage
-        uses: codecov/codecov-action@v4
+        uses: codecov/codecov-action@v5
diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 000000000..d024d0c1f
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1,14 @@
+# This file specifies framework-level core developers for automated review requests
+#
+# Note: historically, this file maintained the list of
+# algorithm maintainers specified in GOVERNANCE.md.
+# This is no longer the case: algorithm maintainers are now
+# specified directly in the estimator,
+# in the "maintainers" tag of the respective scikit-base object.
+#
+# Algorithm maintainers are programmatically queryable
+# via Estimator.get_class_tag("maintainers").
+# Further lookup, such as "which algorithms does M maintain",
+# can be carried out using registry.all_estimators
+
+* @achieveordie @benheid @fkiraly @fnhirwa @geetu040 @pranavvp16 @sairevanth25 @XinyuWuu @yarnabrina
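Aside (not part of the patch): the CODEOWNERS note above mentions programmatic maintainer lookup. A hedged sketch of that query, assuming `all_objects` from `skpro.registry` (imported elsewhere in this diff) and the scikit-base `"maintainers"` tag:

```python
# sketch: which estimators does a given handle maintain?
from skpro.registry import all_objects

maintained = [
    name
    for name, est in all_objects(return_names=True)
    if "fkiraly" in est.get_class_tag("maintainers", [])
]
print(maintained)
```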
diff --git a/README.md b/README.md
index ba57a1b26..371c6f284 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@

-:rocket: **Version 2.7.0 out now!** [Read the release notes here.](https://skpro.readthedocs.io/en/latest/changelog.html).
+:rocket: **Version 2.8.0 out now!** [Read the release notes here](https://skpro.readthedocs.io/en/latest/changelog.html).

 `skpro` is a library for supervised probabilistic prediction in python.
 It provides `scikit-learn`-like, `scikit-base` compatible interfaces to:
diff --git a/docs/source/_static/switcher.json b/docs/source/_static/switcher.json
index 904bbdd27..71963dad0 100644
--- a/docs/source/_static/switcher.json
+++ b/docs/source/_static/switcher.json
@@ -5,7 +5,12 @@
     "url": "https://skpro.readthedocs.io/en/latest/"
   },
   {
-    "name": "2.7.0 (stable)",
+    "name": "2.8.0 (stable)",
+    "version": "stable",
+    "url": "https://skpro.readthedocs.io/en/v2.8.0/"
+  },
+  {
+    "name": "2.7.0",
-    "version": "stable",
+    "version": "2.7.0",
     "url": "https://skpro.readthedocs.io/en/v2.7.0/"
   },
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index ffda1181f..bf8ad626a 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -15,6 +15,46 @@ You can also subscribe to ``skpro``'s
 For planned changes and upcoming releases, see roadmap in the `issue tracker
 `_.

+[2.8.0] - 2024-11-17
+====================
+
+Feature and maintenance release.
+
+Dependency changes
+~~~~~~~~~~~~~~~~~~
+
+* ``scikit-base`` bounds have been updated to ``>=0.6.1,<0.13.0``.
+* ``pymc`` is now a soft dependency, for probabilistic regressors.
+* ``polars`` (data container soft dependency) bounds have been updated to ``<1.14.0``.
+
+Enhancements
+~~~~~~~~~~~~
+
+* [ENH] Creating a new Bayesian Regressor with ``pymc`` as a backend (:pr:`358`) :user:`meraldoantonio`
+* [ENH] add suite test for docstring and ``get_test_params`` coverage (:pr:`482`) :user:`fkiraly`
+* [ENH] Synchronize dependency checker with ``sktime`` counterpart (:pr:`490`) :user:`meraldoantonio`
+
+Maintenance
+~~~~~~~~~~~
+
+* [MNT] fix failing ``code-quality`` CI step (:pr:`483`) :user:`fkiraly`
+* [MNT] [Dependabot](deps): Update ``scikit-base`` requirement from ``<0.12.0,>=0.6.1`` to ``>=0.6.1,<0.13.0`` (:pr:`483`) :user:`dependabot[bot]`
+* [MNT] [Dependabot](deps): Update ``sphinx-gallery`` requirement from ``<0.18.0`` to ``<0.19.0`` (:pr:`481`) :user:`dependabot[bot]`
+* [MNT] [Dependabot](deps): Update ``sphinx-issues`` requirement from ``<5.0.0`` to ``<6.0.0`` (:pr:`484`) :user:`dependabot[bot]`
+* [MNT] [Dependabot](deps): Update ``polars`` requirement from ``<1.10.0`` to ``<1.14.0`` (:pr:`491`) :user:`dependabot[bot]`
+* [MNT] [Dependabot](deps): Bump codecov/codecov-action from ``4`` to ``5`` (:pr:`494`) :user:`dependabot[bot]`
+
+Documentation
+~~~~~~~~~~~~~
+
+* [DOC] in docstring, rename ``Example`` to ``Examples`` sections (:pr:`487`) :user:`fkiraly`
+
+Contributors
+~~~~~~~~~~~~
+
+:user:`fkiraly`,
+:user:`meraldoantonio`
+
 [2.7.0] - 2024-10-08
 ====================
diff --git a/pyproject.toml b/pyproject.toml
index bfe543c35..72173f8b5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "skpro"
-version = "2.7.0"
+version = "2.8.0"
 description = "A unified framework for tabular probabilistic regression, time-to-event prediction, and probability distributions in python"
 authors = [
   {name = "skpro developers", email = "info@sktime.net"},
@@ -42,7 +42,7 @@ dependencies = [
   "numpy>=1.21.0,<2.2",
   "pandas>=1.1.0,<2.3.0",
   "packaging",
-  "scikit-base>=0.6.1,<0.12.0",
+  "scikit-base>=0.6.1,<0.13.0",
   "scikit-learn>=0.24.0,<1.6.0",
   "scipy<2.0.0,>=1.2.0",
 ]
@@ -57,6 +57,7 @@ all_extras = [
   "ngboost<0.6.0; python_version < '3.13'",
   "polars<1.14.0",
   "pyarrow<14.0.0; python_version < '3.12'",
+  "pymc; python_version < '3.13'",
   "scikit-survival<0.24.0; python_version < '3.13'",
   "statsmodels>=0.12.1",
 ]
diff --git a/skpro/__init__.py b/skpro/__init__.py
index 62dc671ac..acc60bcc2 100644
--- a/skpro/__init__.py
+++ b/skpro/__init__.py
@@ -1,6 +1,6 @@
 """skpro."""

-__version__ = "2.7.0"
+__version__ = "2.8.0"

 __all__ = ["show_versions"]
diff --git a/skpro/base/_base.py b/skpro/base/_base.py
index 08c9f78ca..093ac4550 100644
--- a/skpro/base/_base.py
+++ b/skpro/base/_base.py
@@ -28,30 +28,6 @@ class BaseObject(_CommonTags, _BaseObject):
     def __init__(self):
         super().__init__()

-        import sys
-        from warnings import warn
-
-        from packaging.specifiers import SpecifierSet
-
-        py39_or_higher = SpecifierSet(">=3.9")
-        sys_version = sys.version.split(" ")[0]
-
-        # todo 2.8.0 - check whether python 3.8 eol is reached.
-        # If yes, remove this msg.
-        if sys_version not in py39_or_higher:
-            warn(
-                f"From skpro 2.5.0, skpro requires Python version >=3.9, "
-                f"but found {sys_version}. "
-                "The package can still be installed, until 3.8 end of life "
-                "is reached, "
-                "but some functionality may not work as test coverage is dropped."
-                "Kindly note for context: python 3.8 will reach end of life "
-                "in October 2024, and multiple skpro core dependencies, "
-                "including scikit-learn, have already dropped support for 3.8. ",
-                category=DeprecationWarning,
-                stacklevel=2,
-            )
-

 class BaseEstimator(_CommonTags, _BaseEstimator):
     """Base class for fittable objects."""
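Aside (not part of the patch): per the changelog above, `pymc` is now a soft dependency. A hedged sketch of a pre-flight check before using the new estimator introduced below, via the `_check_soft_dependencies` utility this diff also rewrites (`severity="none"` returns a boolean instead of raising):

```python
# sketch: guard optional pymc-backed functionality
from skpro.utils.validation._dependencies import _check_soft_dependencies

if _check_soft_dependencies("pymc", severity="none"):
    from skpro.regression.bayesian import BayesianLinearRegressor

    reg = BayesianLinearRegressor()
else:
    print("pymc not installed; run: pip install 'skpro[all_extras]'")
```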
", - category=DeprecationWarning, - stacklevel=2, - ) - class BaseEstimator(_CommonTags, _BaseEstimator): """Base class for fittable objects.""" diff --git a/skpro/regression/bayesian.py b/skpro/regression/bayesian.py new file mode 100644 index 000000000..f5648702f --- /dev/null +++ b/skpro/regression/bayesian.py @@ -0,0 +1,581 @@ +""" +Simple Bayesian Linear Regressor. + +Bayesian Linear Regression defined with user-specified priors or defaults for slopes, +intercept, and noise; implemented using the pymc backend. +""" + +# copyright: skpro developers +__author__ = ["meraldoantonio"] + +from skpro.regression.base import BaseProbaRegressor +from skpro.utils.validation._dependencies import _check_soft_dependencies + + +class BayesianLinearRegressor(BaseProbaRegressor): + """ + Bayesian Linear Regression class with MCMC sampling. + + Defined with user-specified priors or defaults for slopes, intercept, + and noise; implemented using the pymc backend. + + Parameters + ---------- + prior_config : Dictionary, optional + Dictionary of priors + Class-default defined by default_prior_config method. + sampler_config : Dictionary, optional + Dictionary of parameters that initialise sampler configuration. + Class-default defined by default_sampler_config method. + + Example + ------- + >>> from skpro.regression.bayesian import BayesianLinearRegressor + >>> from sklearn.datasets import load_diabetes # doctest: +SKIP + >>> from sklearn.model_selection import train_test_split # doctest: +SKIP + >>> X, y = load_diabetes(return_X_y=True, as_frame=True) # doctest: +SKIP + >>> X_train, X_test, y_train, y_test = train_test_split(X, y) # doctest: +SKIP + + >>> bayes_model = BayesianLinearRegressor() # doctest: +SKIP + >>> bayes_model.fit(X_train, y_train) # doctest: +SKIP + >>> y_test_pred_proba = bayes_model.predict_proba(X_test) # doctest: +SKIP + >>> y_test_pred = bayes_model.predict(X_test) # doctest: +SKIP + """ + + _tags = { + # packaging info + # -------------- + "authors": ["meraldoantonio"], + "python_version": ">=3.10", + "python_dependencies": [ + "pymc", + "pymc_marketing", + "arviz>=0.18.0", + ], + # estimator tags + # -------------- + "capability:multioutput": False, # can the estimator handle multi-output data? + "capability:missing": True, # can the estimator handle missing data? + "X_inner_mtype": "pd_DataFrame_Table", # type seen in internal _fit, _predict + "y_inner_mtype": "pd_DataFrame_Table", # type seen in internal _fit + } + + def __init__(self, prior_config=None, sampler_config=None): + if sampler_config is None: + sampler_config = {} + if prior_config is None: + prior_config = {} # configuration for priors + self.sampler_config = {**self.default_sampler_config, **sampler_config} + self.prior_config = {**self.default_prior_config, **prior_config} + self.model = None # generated during fitting + self.idata = None # generated during fitting + self._predict_done = False # a flag indicating if a prediction has been done + + print( # noqa: T201 + f"instantiated {self.__class__.__name__} with the following priors:" + ) + + for key, value in self.prior_config.items(): + print(f" - {key}: {value}") # noqa: T201 + + super().__init__() + + @property + def default_prior_config(self): + """Return a dictionary of prior defaults.""" + from pymc_marketing.prior import Prior + + print( # noqa: T201 + "The model assumes that the intercept and slopes are independent. \n\ + Modify the model if this assumption doesn't apply!" 
+ ) + default_prior_config = { + "intercept": Prior( + "Normal", mu=0, sigma=100 + ), # Weakly informative normal prior with large sigma + "slopes": Prior( + "Normal", mu=0, sigma=100, dims=("pred_id",) + ), # Same for slopes + "noise_var": Prior( + "HalfCauchy", beta=5 + ), # Weakly informative Half-Cauchy prior for noise variance + } + return default_prior_config + + @property + def default_sampler_config(self): + """Return a class default sampler configuration dictionary.""" + default_sampler_config = { + "draws": 1000, + "tune": 1000, + "chains": 2, + "target_accept": 0.95, + "random_seed": 123, + "progressbar": True, + } + return default_sampler_config + + def _fit(self, X, y): + """Fit regressor to training data. + + Writes to self: + Sets fitted model attributes ending in "_". + + Parameters + ---------- + X : pandas DataFrame + feature instances to fit regressor to + y : pandas DataFrame, must be same length as X + labels to fit regressor to + + Returns + ------- + self : reference to self + """ + import warnings + + import pandas as pd + import pymc as pm + + assert len(y.columns) == 1, "y must have only one column!" + self._X = X + self._y = y + self._y_vals = y.values[ + :, 0 + ] # we need a 1-dimensional array for compatibility with pymc + + # Model construction and posterior sampling + with pm.Model(coords={"obs_id": X.index, "pred_id": X.columns}) as self.model: + # Mutable data containers for X and y + X_data = pm.Data("X", X, dims=("obs_id", "pred_id")) + y_data = pm.Data("y", self._y_vals, dims=("obs_id")) + + # Priors for model parameters, taken from self.prior_config + self.intercept = self.prior_config["intercept"].create_variable("intercept") + self.slopes = self.prior_config["slopes"].create_variable("slopes") + self.noise_var = self.prior_config["noise_var"].create_variable("noise_var") + self.noise = pm.Deterministic("noise", self.noise_var**0.5) + + # Expected value of the target variable + self.mu = pm.Deterministic( + "mu", self.intercept + pm.math.dot(X_data, self.slopes) + ) + + # Likelihood of observations + y_obs = pm.Normal( # noqa: F841 + "y_obs", mu=self.mu, sigma=self.noise, observed=y_data, dims=("obs_id") + ) + + # Constructing the posterior + self.idata = pm.sample(**self.sampler_config) + + # Incorporation of training_data as a new group in self.idata + training_data = pd.concat([X, y], axis=1) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=UserWarning, + ) + self.idata.add_groups(training_data=training_data.to_xarray()) + return self + + def visualize_model(self, **kwargs): + """Use Graphviz to visualize the model flow.""" + _check_soft_dependencies( + "graphviz", msg="You need to install Graphviz to use this method!" + ) + import pymc as pm + + assert self._is_fitted, "You need to fit the model before visualizing it!" + + return pm.model_to_graphviz(self.model, **kwargs) + + def _sample_dataset(self, group_name, return_type=None): + """ + General method to sample from a specified group in the idata object. + + Extracts samples from a specified group (e.g., 'prior') in the idata object and + returns them in the required format + + Parameters + ---------- + group_name : str + The name of the group in the idata object to sample from (e.g., 'prior'). + + return_type : str or None, optional (default=None) + The format in which to return the sampled distributions. 
+
+    def _sample_dataset(self, group_name, return_type=None):
+        """
+        General method to sample from a specified group in the idata object.
+
+        Extracts samples from a specified group (e.g., 'prior') in the idata object
+        and returns them in the required format.
+
+        Parameters
+        ----------
+        group_name : str
+            The name of the group in the idata object to sample from (e.g., 'prior').
+
+        return_type : str or None, optional (default=None)
+            The format in which to return the sampled distributions.
+            Accepted values are:
+            - "xarray": Returns an xarray.Dataset
+            - "numpy": Returns a dictionary of NumPy arrays
+            - "dataframe": Returns a pandas DataFrame
+            - "skpro": Returns an `Empirical` distribution from the skpro library.
+            - None: Does not return any sampled data but performs the sampling
+              and updates the 'idata' attribute.
+
+        Returns
+        -------
+        xarray.Dataset or dict or pd.DataFrame or skpro.distributions.Empirical or None
+            The sampled distributions in the specified format,
+            or None if return_type is None.
+        """
+        import pandas as pd
+
+        # Validate the return_type
+        assert return_type in [
+            "xarray",
+            "numpy",
+            "dataframe",
+            "skpro",
+            None,
+        ], "return_type must be one of 'xarray', 'numpy', 'dataframe', 'skpro', or None"
+
+        # Validate that the group_name exists in idata
+        assert hasattr(
+            self.idata, group_name
+        ), f"{group_name} group does not exist in the idata object."
+
+        # Get the specified group from idata
+        group = getattr(self.idata, group_name)
+        # prediction-specific groups which focus on the posterior predictive
+        is_predictive = group_name in ["predictions", "posterior_predictive"]
+        # as opposed to ["prior", "posterior"], which focus on prior/posterior
+
+        if is_predictive:
+            variables = ["y_obs"]
+        else:
+            variables = ["intercept", "slopes", "noise_var", "noise"]
+
+        if return_type is None:
+            return None
+        elif return_type == "xarray":
+            return group
+        else:
+            data_dict = {}
+
+            for var in variables:
+                # Check if the variable has a `pred_id` dimension
+                if var in group and "pred_id" in group[var].dims:
+                    # Iterate through each feature (e.g., 'feature1', 'feature2')
+                    for feature in group[var].pred_id.values:
+                        # Select the slope for the current feature and flatten it
+                        feature_key = f"{var}_{feature}"
+                        data_dict[feature_key] = (
+                            group[var]
+                            .sel(pred_id=feature)
+                            .stack({"sample": ("chain", "draw")})
+                            .values.squeeze()
+                        )
+                else:
+                    if var in group:
+                        data_dict[var] = (
+                            group[var]
+                            .stack({"sample": ("chain", "draw")})
+                            .values.squeeze()
+                        )
+
+            if return_type == "numpy":
+                return data_dict
+
+            elif return_type == "dataframe":
+                if is_predictive:
+                    return pd.DataFrame(data_dict["y_obs"]).T
+                else:
+                    return pd.DataFrame(data_dict)
+
+            elif return_type == "skpro":
+                from skpro.distributions import Empirical
+
+                if not is_predictive:
+                    df = pd.DataFrame(data_dict)
+                    reshaped_df = df.stack()
+                    reshaped_df = reshaped_df.reset_index(name="value")
+                    reshaped_df.set_index(["level_0", "level_1"], inplace=True)
+                    reshaped_df.index.names = ["obs_id", "variable"]
+                    return Empirical(spl=reshaped_df)
+                else:
+                    # Extract posterior predictive distributions as an xarray DataArray
+                    pred_proba_xarray = group["y_obs"]
+
+                    # Convert data to pd.DataFrame and format it appropriately for
+                    # subsequent conversion into a skpro Empirical distribution
+                    pred_proba_df = pred_proba_xarray.to_dataframe()
+                    pred_proba_df = pred_proba_df.reset_index()
+
+                    # Create a new 'sample_id' column by
+                    # combining the 'chain' and 'draw' columns
+                    pred_proba_df["sample_id"] = (
+                        pred_proba_df["chain"] * self.sampler_config["draws"]
+                        + pred_proba_df["draw"]
+                    )
+                    pred_proba_df = pred_proba_df[["obs_id", "sample_id", "y_obs"]]
+                    pred_proba_df = pred_proba_df.rename(
+                        columns={"y_obs": self._y.columns[0]}
+                    )
+                    pred_proba_df = pred_proba_df.set_index(["sample_id", "obs_id"])
+
+                    # Convert data to skpro Empirical distribution
+                    pred_proba_dist = Empirical(
+                        spl=pred_proba_df, columns=self._y.columns
+                    )
+                    return pred_proba_dist
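Aside (not part of the patch): a minimal sketch of the `(sample_id, obs_id)` row-MultiIndex layout that `Empirical` receives in the predictive branch above; names and toy values are illustrative:

```python
# sketch: two draws (sample_id) for each of two observations (obs_id)
import pandas as pd

from skpro.distributions import Empirical

spl = pd.DataFrame(
    {"y": [1.0, 0.9, 1.1, 1.2]},
    index=pd.MultiIndex.from_product(
        [[0, 1], [0, 1]], names=["sample_id", "obs_id"]
    ),
)
dist = Empirical(spl=spl, columns=["y"])  # mirrors the construction above
```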
+
+    def _get_dataset_summary(self, group_name, var_names=None, **kwargs):
+        """
+        Get the summary statistics of a specified group in the idata object.
+
+        Parameters
+        ----------
+        group_name : str
+            The name of the group in the idata object to summarize (e.g., 'prior').
+
+        var_names : list, optional (default=None)
+            A list of variable names to include in the summary.
+            If None, all variables in the group are included.
+
+        **kwargs :
+            Additional keyword arguments to pass to `arviz.summary`.
+
+        Returns
+        -------
+        pandas.DataFrame
+            The summary statistics for the specified group and variables.
+        """
+        import arviz as az
+
+        # Check if the specified group exists in the idata object
+        if group_name not in self.idata.groups():
+            if group_name == "prior":
+                self.sample_prior()
+            elif group_name == "posterior":
+                self.sample_posterior()
+            else:
+                raise ValueError(
+                    f"Group '{group_name}' does not exist in the idata object."
+                )
+
+        # Get the summary statistics with optional kwargs
+        return az.summary(
+            getattr(self.idata, group_name), var_names=var_names, **kwargs
+        )
+
+    def sample_prior(self, return_type=None):
+        """
+        Sample from the prior distributions.
+
+        Samples from the prior distributions and returns
+        them in the required format.
+
+        If return_type is None, the method updates the 'idata' attribute
+        by adding the 'prior' group but does not return any samples.
+
+        Parameters
+        ----------
+        return_type : str or None, optional (default=None)
+            The format in which to return the sampled distributions.
+            Accepted values are:
+            - "xarray": Returns an xarray.Dataset
+            - "numpy": Returns a dictionary of NumPy arrays
+            - "dataframe": Returns a pandas DataFrame
+            - "skpro": Returns an `Empirical` distribution from the skpro library.
+            - None: Does not return any sampled data but performs the sampling
+              and updates the 'idata' attribute.
+
+        Returns
+        -------
+        xarray.Dataset or dict or pd.DataFrame or skpro.distributions.Empirical or None
+            The sampled distributions in the specified format,
+            or None if return_type is None.
+        """
+        import pymc as pm
+
+        assert (
+            self.is_fitted
+        ), "Model needs to be fitted before you can sample from prior"
+
+        with self.model:
+            # if we've previously used the model for prediction,
+            # we need to reset the reference of 'X' to the X used for training
+            if self._predict_done:
+                pm.set_data(
+                    {"X": self._X},
+                    coords={"obs_id": self._X.index, "pred_id": self._X.columns},
+                )
+            self.idata.extend(
+                pm.sample_prior_predictive(
+                    samples=self.sampler_config["draws"],
+                    random_seed=self.sampler_config["random_seed"],
+                )
+            )  # todo: the keyword 'samples' will be changed to 'draws'
+            # in pymc 5.16
+
+        return self._sample_dataset(
+            group_name="prior",
+            return_type=return_type,
+        )
+
+    def get_prior_summary(self, **kwargs):
+        """
+        Get the summary statistics of prior distributions.
+
+        Parameters
+        ----------
+        **kwargs :
+            Additional keyword arguments to pass to `arviz.summary`.
+
+        Returns
+        -------
+        pandas.DataFrame
+            The summary statistics for the prior distributions.
+        """
+        return self._get_dataset_summary(
+            group_name="prior",
+            var_names=["intercept", "slopes", "noise_var", "noise"],
+            **kwargs,
+        )
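Aside (not part of the patch): a hedged sketch of the prior-inspection workflow using the methods above; `X_train` and `y_train` are placeholder pandas objects, and `fit` must run first since `sample_prior` asserts a fitted model:

```python
# sketch: fit, then inspect the sampled priors
reg = BayesianLinearRegressor()
reg.fit(X_train, y_train)  # X_train, y_train: placeholder pandas DataFrames
prior_df = reg.sample_prior(return_type="dataframe")  # one column per parameter
print(reg.get_prior_summary())  # arviz summary table of the prior draws
```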
+
+    def sample_posterior(self, return_type=None):
+        """
+        Sample from the posterior distributions.
+
+        Samples from the posterior distributions and returns
+        them in the required format.
+
+        If return_type is None, the method updates the 'idata' attribute
+        by adding the 'posterior' group but does not return any samples.
+
+        Parameters
+        ----------
+        return_type : str or None, optional (default=None)
+            The format in which to return the sampled distributions.
+            Accepted values are:
+            - "xarray": Returns an xarray.Dataset
+            - "numpy": Returns a dictionary of NumPy arrays
+            - "dataframe": Returns a pandas DataFrame
+            - "skpro": Returns an `Empirical` distribution from the skpro library.
+            - None: Does not return any sampled data but performs the sampling
+              and updates the 'idata' attribute.
+
+        Returns
+        -------
+        xarray.Dataset or dict or pd.DataFrame or skpro.distributions.Empirical or None
+            The sampled distributions in the specified format,
+            or None if return_type is None.
+        """
+        assert (
+            self.is_fitted
+        ), "The model must be fitted before posterior can be returned."
+        return self._sample_dataset(
+            group_name="posterior",
+            return_type=return_type,
+        )
+
+    def get_posterior_summary(self, **kwargs):
+        """
+        Get the summary statistics of the posterior distributions.
+
+        Parameters
+        ----------
+        **kwargs :
+            Additional keyword arguments to pass to `arviz.summary`.
+
+        Returns
+        -------
+        pandas.DataFrame
+            The summary statistics for the posterior distributions.
+        """
+        return self._get_dataset_summary(
+            group_name="posterior",
+            var_names=["intercept", "slopes", "noise_var", "noise"],
+            **kwargs,
+        )
+
+    def sample_in_sample_posterior_predictive(self, return_type=None):
+        """Perform in-sample predictions and sample from them."""
+        import pymc as pm
+
+        with self.model:
+            # if we've previously used the model for prediction,
+            # we need to reset the reference of 'X' to X_train (i.e. self._X)
+            if self._predict_done:
+                pm.set_data(
+                    {"X": self._X},
+                    coords={"obs_id": self._X.index, "pred_id": self._X.columns},
+                )
+            self.idata.extend(
+                pm.sample_posterior_predictive(self.idata, predictions=False)
+            )
+
+        return self._sample_dataset(
+            group_name="posterior_predictive", return_type=return_type
+        )
+
+    def plot_ppc(self, **kwargs):
+        """Plot the posterior predictive check."""
+        import arviz as az
+
+        if "posterior_predictive" not in self.idata:
+            self.sample_in_sample_posterior_predictive()
+
+        return az.plot_ppc(self.idata, **kwargs)
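Aside (not part of the patch): continuing the sketch above, a posterior predictive check on the training data; `num_pp_samples` is a standard `arviz.plot_ppc` keyword forwarded via `**kwargs`:

```python
# sketch: in-sample posterior predictive check for a fitted reg
reg.sample_in_sample_posterior_predictive()
reg.plot_ppc(num_pp_samples=100)  # thin the plot to 100 predictive draws
```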
+
+    def _predict_proba(self, X):
+        """
+        Predict distribution over labels for data from features.
+
+        State required:
+            Requires state to be "fitted".
+
+        Accesses in self:
+            Fitted model attributes ending in "_"
+
+        Parameters
+        ----------
+        X : pandas DataFrame, must have same columns as X in `fit`
+            data to predict labels for
+
+        Returns
+        -------
+        pred_proba_dist : skpro BaseDistribution, same length as `X`
+            labels predicted for `X`
+        """
+        import pymc as pm
+
+        with self.model:
+            if "predictions" in self.idata.groups():
+                del self.idata.predictions
+
+            # Set X to be the new 'X' variable, then sample posterior predictive
+            pm.set_data({"X": X}, coords={"obs_id": X.index, "pred_id": X.columns})
+            self.idata.extend(
+                pm.sample_posterior_predictive(
+                    self.idata,
+                    predictions=True,
+                )
+            )
+            self._predict_done = True  # a flag indicating prediction has been done
+
+        return self._sample_dataset(group_name="predictions", return_type="skpro")
+
+    # todo: return default parameters, so that a test instance can be created
+    # required for automated unit and integration testing of estimator
+    @classmethod
+    def get_test_params(cls, parameter_set="default"):
+        """Return testing parameter settings for the estimator.
+
+        Parameters
+        ----------
+        parameter_set : str, default="default"
+            Name of the set of test parameters to return, for use in tests. If no
+            special parameters are defined for a value, will return the `"default"` set.
+
+        Returns
+        -------
+        params : dict or list of dict, default = {}
+            Parameters to create testing instances of the class.
+            Each dict contains parameters to construct an "interesting" test instance,
+            i.e., `MyClass(**params)` or `MyClass(**params[i])` creates a valid one.
+            `create_test_instance` uses the first (or only) dictionary in `params`.
+        """
+        from pymc_marketing.prior import Prior
+
+        params1 = {}
+        params2 = {"prior_config": {"intercept": Prior("Normal", mu=0, sigma=10)}}
+
+        return [params1, params2]
diff --git a/skpro/tests/_config.py b/skpro/tests/_config.py
index 31c08b3fe..0775ff69f 100644
--- a/skpro/tests/_config.py
+++ b/skpro/tests/_config.py
@@ -2,4 +2,10 @@

 # list of str, names of estimators to exclude from testing
 # WARNING: tests for these estimators will be skipped
-EXCLUDE_ESTIMATORS = ["DummySkipped"]
+EXCLUDE_ESTIMATORS = [
+    "DummySkipped",
+    "ClassName",  # exclude classes from extension templates
+]
+
+
+EXCLUDED_TESTS = {"GLMRegressor": ["test_online_update"]}
diff --git a/skpro/tests/test_all_estimators.py b/skpro/tests/test_all_estimators.py
index f06263c6c..1874c9166 100644
--- a/skpro/tests/test_all_estimators.py
+++ b/skpro/tests/test_all_estimators.py
@@ -13,6 +13,7 @@
 from skbase.testing.utils.inspect import _get_args

 from skpro.registry import OBJECT_TAG_LIST, all_objects
+from skpro.tests._config import EXCLUDE_ESTIMATORS, EXCLUDED_TESTS
 from skpro.tests.scenarios.scenarios_getter import retrieve_scenarios
 from skpro.tests.test_switch import run_test_for_class
 from skpro.utils.deep_equals import deep_equals
@@ -35,7 +36,12 @@ class PackageConfig:

     # list of object types (class names) to exclude
     # expected type: list of str, str are class names
-    exclude_objects = ["ClassName"]  # exclude classes from extension templates
+    exclude_objects = EXCLUDE_ESTIMATORS
+
+    # list of tests to exclude
+    # expected type: dict of lists, key: str, value: list of str
+    # keys are class names of estimators, values are lists of test names to exclude
+    excluded_tests = EXCLUDED_TESTS

     # list of valid tags
     # expected type: list of str, str are tag names
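Aside (not part of the patch): the new exclusion config is keyed by estimator class name; a hypothetical extension could look as follows — the second entry and its test name are invented for illustration:

```python
# hypothetical extension of skpro/tests/_config.py:
# keys are estimator class names, values are lists of test names to skip
EXCLUDED_TESTS = {
    "GLMRegressor": ["test_online_update"],
    "BayesianLinearRegressor": ["test_some_expensive_test"],  # invented example
}
```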
diff --git a/skpro/utils/_maint/_show_versions.py b/skpro/utils/_maint/_show_versions.py
index 542c5b9bd..cc2c621ce 100644
--- a/skpro/utils/_maint/_show_versions.py
+++ b/skpro/utils/_maint/_show_versions.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3 -u
 # License: BSD 3 clause
-
 """Utility methods to print system info for debugging.

-adapted from :func:`sklearn.show_versions`
+adapted from
+:func:`sklearn.show_versions`
 """

 __author__ = ["mloning", "fkiraly"]
@@ -15,8 +15,7 @@


 def _get_sys_info():
-    """
-    System information.
+    """System information.

     Return
     ------
@@ -37,28 +36,33 @@ def _get_sys_info():
 # dependencies to print versions of, by default
 DEFAULT_DEPS_TO_SHOW = [
     "pip",
-    "skpro",
-    "skbase",
+    "skpro",
     "sklearn",
+    "skbase",
     "numpy",
     "scipy",
     "pandas",
     "matplotlib",
     "joblib",
     "numba",
-    "tabulate",
-    "uncertainties",
+    "statsmodels",
+    "pmdarima",
+    "statsforecast",
+    "tsfresh",
+    "tslearn",
+    "torch",
+    "tensorflow",
+    "tensorflow_probability",
 ]


 def _get_deps_info(deps=None):
-    """
-    Overview of the installed version of main dependencies.
+    """Overview of the installed version of main dependencies.

     Parameters
     ----------
     deps : optional, list of strings with import names
-        if None, behaves as deps = ["skpro"]
+        if None, behaves as deps = ["skpro"]

     Returns
     -------
@@ -68,10 +72,10 @@ def _get_deps_info(deps=None):
     of the import as present in the current python environment
     """
     if deps is None:
         deps = ["skpro"]

     def get_version(module):
-        return module.__version__
+        return getattr(module, "__version__", None)

     deps_info = {}

@@ -81,16 +85,17 @@ def _get_deps_info(deps=None):
             mod = sys.modules[modname]
         else:
             mod = importlib.import_module(modname)
-            ver = get_version(mod)
-            deps_info[modname] = ver
     except ImportError:
         deps_info[modname] = None
+    else:
+        ver = get_version(mod)
+        deps_info[modname] = ver

     return deps_info


 def show_versions():
-    """Print python version, OS version, skpro version, selected dependency versions.
+    """Print python version, OS version, skpro version, selected dependency versions.

     Pretty prints:
diff --git a/skpro/utils/_maint/tests/test_show_versions.py b/skpro/utils/_maint/tests/test_show_versions.py
index 928017df1..67c7f1877 100644
--- a/skpro/utils/_maint/tests/test_show_versions.py
+++ b/skpro/utils/_maint/tests/test_show_versions.py
@@ -1,4 +1,6 @@
 """Tests for the show_versions utility."""
+import pathlib
+import uuid

 from skpro.utils._maint._show_versions import (
     DEFAULT_DEPS_TO_SHOW,
@@ -18,16 +20,39 @@ def test_deps_info():
     """Test that _get_deps_info returns package/version dict as per contract."""
     deps_info = _get_deps_info()
     assert isinstance(deps_info, dict)
     assert set(deps_info.keys()) == {"skpro"}

     deps_info_default = _get_deps_info(DEFAULT_DEPS_TO_SHOW)
     assert isinstance(deps_info_default, dict)
     assert set(deps_info_default.keys()) == set(DEFAULT_DEPS_TO_SHOW)

+    PKG_IMPORT_ALIAS = {"scikit-learn": "sklearn", "scikit-base": "skbase"}
+    KEY_ALIAS = {"sklearn": "scikit-learn", "skbase": "scikit-base"}
+
     for key in DEFAULT_DEPS_TO_SHOW:
-        key_is_available = _check_soft_dependencies(key, severity="none")
+        pkg_name = KEY_ALIAS.get(key, key)
+        key_is_available = _check_soft_dependencies(
+            pkg_name,
+            severity="none",
+            package_import_alias=PKG_IMPORT_ALIAS,
+        )
         assert (deps_info_default[key] is None) != key_is_available

         if key_is_available:
-            assert _check_soft_dependencies(f"{key}=={deps_info_default[key]}")
+            assert _check_soft_dependencies(
+                f"{pkg_name}=={deps_info_default[key]}",
+                package_import_alias=PKG_IMPORT_ALIAS,
+            )

         deps_single_key = _get_deps_info([key])
         assert set(deps_single_key.keys()) == {key}
+
+
+def test_deps_info_deps_missing_package_present_directory():
+    """Test that _get_deps_info does not fail if a dependency is missing."""
+    dummy_package_name = uuid.uuid4().hex
+
+    dummy_folder_path = pathlib.Path(dummy_package_name)
+    dummy_folder_path.mkdir()
+
+    assert _get_deps_info([dummy_package_name]) == {dummy_package_name: None}
+
+    dummy_folder_path.rmdir()
diff --git a/skpro/utils/validation/_dependencies.py b/skpro/utils/validation/_dependencies.py
index f7b24fb04..b303924ea 100644
--- a/skpro/utils/validation/_dependencies.py
+++ b/skpro/utils/validation/_dependencies.py
@@ -2,23 +2,26 @@

 __author__ = ["fkiraly", "mloning"]

-import io
 import sys
 import warnings
-from importlib import import_module
+from importlib.metadata import PackageNotFoundError, version
+from importlib.util import find_spec
 from inspect import isclass

+from packaging.markers import InvalidMarker, Marker
 from packaging.requirements import InvalidRequirement, Requirement
-from packaging.specifiers import InvalidSpecifier, SpecifierSet
+from packaging.specifiers import InvalidSpecifier, Specifier, SpecifierSet
+from packaging.version import InvalidVersion, Version


+# todo 0.32.0: remove suppress_import_stdout argument
 def _check_soft_dependencies(
     *packages,
     package_import_alias=None,
     severity="error",
     obj=None,
     msg=None,
-    suppress_import_stdout=False,
+    suppress_import_stdout="deprecated",
 ):
     """Check if required soft dependencies are installed and raise error or warning.
@@ -40,7 +43,7 @@ def _check_soft_dependencies(
         should be provided if import name differs from package name
     severity : str, "error" (default), "warning", "none"
         behaviour for raising errors or warnings
-        "error" - raises a `ModuleNotFoundException` if one of packages is not installed
+        "error" - raises a `ModuleNotFoundError` if one of packages is not installed
         "warning" - raises a warning if one of packages is not installed
             function returns False if one of packages is not installed, otherwise True
         "none" - does not raise exception or warning
@@ -52,8 +55,6 @@ def _check_soft_dependencies(
         if str is passed, will be used as name of the class/object or module
     msg : str, or None, default=None
         if str, will override the error message or warning shown with msg
-    suppress_import_stdout : bool, optional. Default=False
-        whether to suppress stdout printout upon import.

     Raises
     ------
@@ -64,6 +65,22 @@ def _check_soft_dependencies(
     -------
     boolean - whether all packages are installed, only if no exception is raised
     """
+    # todo 0.32.0: remove this warning
+    if suppress_import_stdout != "deprecated":
+        warnings.warn(
+            "In skpro _check_soft_dependencies, the suppress_import_stdout argument "
+            "is deprecated and no longer has any effect. "
+            "The argument will be removed in version 0.32.0, so users of the "
+            "_check_soft_dependencies utility should not pass this argument anymore. "
+            "The _check_soft_dependencies utility also no longer causes imports, "
+            "hence no stdout "
+            "output is created from imports, for any setting of the "
+            "suppress_import_stdout argument. If you wish to import packages "
+            "and make use of stdout prints, import the package directly instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
     if len(packages) == 1 and isinstance(packages[0], (tuple, list)):
         packages = packages[0]
     if not all(isinstance(x, str) for x in packages):
@@ -110,6 +127,7 @@ def _check_soft_dependencies(
     for package in packages:
         try:
             req = Requirement(package)
+            req = _normalize_requirement(req)
         except InvalidRequirement:
             msg_version = (
                 f"wrong format for package requirement string "
@@ -127,40 +145,44 @@ def _check_soft_dependencies(
             package_import_name = package_import_alias[package_name]
         else:
             package_import_name = package_name
-        # attempt import - if not possible, we know we need to raise warning/exception
-        try:
-            if suppress_import_stdout:
-                # setup text trap, import, then restore
-                sys.stdout = io.StringIO()
-                pkg_ref = import_module(package_import_name)
-                sys.stdout = sys.__stdout__
-            else:
-                pkg_ref = import_module(package_import_name)
-        # if package cannot be imported, make the user aware of installation requirement
-        except ModuleNotFoundError as e:
+
+        # optimized branching to check presence of import
+        # and presence of package distribution
+        # first we check import, then we check distribution
+        # because try/except consumes more runtime
+        pkg_spec = find_spec(package_import_name)
+        if pkg_spec is not None:
+            try:
+                pkg_env_version = Version(version(package_name))
+            except (InvalidVersion, PackageNotFoundError):
+                pkg_spec = None
+
+        # if package not present, make the user aware of installation reqs
+        if pkg_spec is None:
             if obj is None and msg is None:
                 msg = (
-                    f"{e}. '{package}' is a soft dependency and not included in the "
+                    f"'{package}' not found. "
+                    f"'{package}' is a soft dependency and not included in the "
                     f"base skpro installation. Please run: `pip install {package}` to "
                     f"install the {package} package. "
                     f"To install all soft dependencies, run: `pip install "
                     f"skpro[all_extras]`"
                 )
             elif msg is None:  # obj is not None, msg is None
                 msg = (
                     f"{class_name} requires package '{package}' to be present "
                     f"in the python environment, but '{package}' was not found. "
                     f"'{package}' is a soft dependency and not included in the base "
                     f"skpro installation. Please run: `pip install {package}` to "
                     f"install the {package} package. "
                     f"To install all soft dependencies, run: `pip install "
                     f"skpro[all_extras]`"
                 )
             # if msg is not None, none of the above is executed,
             # so if msg is passed it overrides the default messages

             if severity == "error":
-                raise ModuleNotFoundError(msg) from e
+                raise ModuleNotFoundError(msg)
             elif severity == "warning":
                 warnings.warn(msg, stacklevel=2)
                 return False
@@ -175,8 +197,6 @@ def _check_soft_dependencies(

         # now we check compatibility with the version specifier if non-empty
         if package_version_req != SpecifierSet(""):
-            pkg_env_version = pkg_ref.__version__
-
             msg = (
                 f"{class_name} requires package '{package}' to be present "
                 f"in the python environment, with version {package_version_req}, "
             )
             if obj is not None:
                 msg = msg + (
                     f"This version requirement is not one by skpro, but specific "
                     f"to the module, class or object with name {obj}."
                 )
@@ -207,12 +227,94 @@ def _check_soft_dependencies(
     return True


+def _check_dl_dependencies(msg=None, severity="error"):
+    """Check if deep learning dependencies are installed.
+
+    Parameters
+    ----------
+    msg : str, optional, default = default message (msg below)
+        error message to be returned in the `ModuleNotFoundError`, overrides default
+    severity : str, "error" (default), "warning", "none"
+        behaviour for raising errors or warnings
+        "error" - raises a `ModuleNotFoundError` if one of packages is not installed
+        "warning" - raises a warning if one of packages is not installed
+            function returns False if one of packages is not installed, otherwise True
+        "none" - does not raise exception or warning
+            function returns False if one of packages is not installed, otherwise True
+
+    Raises
+    ------
+    ModuleNotFoundError
+        User friendly error with suggested action to install deep learning dependencies
+
+    Returns
+    -------
+    boolean - whether all packages are installed, only if no exception is raised
+    """
+    if not isinstance(msg, str):
+        msg = (
+            "tensorflow is required for deep learning functionality in `sktime`. "
+            "To install these dependencies, run: `pip install sktime[dl]`"
+        )
+    if find_spec("tensorflow") is not None:
+        return True
+    else:
+        if severity == "error":
+            raise ModuleNotFoundError(msg)
+        elif severity == "warning":
+            warnings.warn(msg, stacklevel=2)
+            return False
+        elif severity == "none":
+            return False
+        else:
+            raise RuntimeError(
+                "Error in calling _check_dl_dependencies, severity "
+                f'argument must be "error", "warning", or "none", found "{severity}".'
+            )
+
+
+def _check_mlflow_dependencies(msg=None, severity="error"):
+    """Check if `mlflow` and its dependencies are installed.
+
+    Parameters
+    ----------
+    msg : str, optional, default = default message (msg below)
+        error message to be returned when `ModuleNotFoundError` is raised.
+    severity : str, either of "error", "warning" or "none"
+        behaviour for raising errors or warnings
+        "error" - raises a `ModuleNotFoundError` if mlflow-related packages are not
+            found.
+        "warning" - raises a warning message if any mlflow-related package is not
+            installed, and also returns False. In case all packages are present,
+            returns True.
+        "none" - does not raise any exception or warning and simply returns True
+            if all packages are installed, otherwise returns False.
+
+    Raises
+    ------
+    ModuleNotFoundError
+        User friendly error with a suggested action to install mlflow dependencies
+
+    Returns
+    -------
+    boolean - whether all mlflow-related packages are installed.
+    """
+    if not isinstance(msg, str):
+        msg = (
+            "`mlflow` is an extra dependency and is not included "
+            "in the base sktime installation. "
+            "Please run `pip install mlflow` "
+            "or `pip install sktime[mlflow]` to install the package."
+        )
+
+    return _check_soft_dependencies("mlflow", msg=msg, severity=severity)
+
+
 def _check_python_version(obj, package=None, msg=None, severity="error"):
     """Check if system python version is compatible with requirements of obj.

     Parameters
     ----------
     obj : skpro estimator, BaseObject descendant
         used to check python version
     package : str, default = None
         if given, will be used in error message as package name
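Aside (not part of the patch): a usage sketch of the rewritten checker, using only arguments shown in this diff; the alias mapping mirrors the one used in the test changes above:

```python
# sketch: boolean check without raising, then a version-bounded check
from skpro.utils.validation._dependencies import _check_soft_dependencies

has_pymc = _check_soft_dependencies("pymc", severity="none")  # True/False

_check_soft_dependencies(
    "scikit-learn>=1.0",  # raises ModuleNotFoundError if absent or too old
    package_import_alias={"scikit-learn": "sklearn"},
)
```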
@@ -253,6 +355,7 @@ def _check_python_version(obj, package=None, msg=None, severity="error"):
     if sys_version in est_specifier:
         return True
     # now we know that est_version is not compatible with sys_version
+
     if isclass(obj):
         class_name = obj.__name__
     else:
@@ -260,7 +363,6 @@ def _check_python_version(obj, package=None, msg=None, severity="error"):

     if not isinstance(msg, str):
         msg = (
-            f"{type(obj).__name__} requires python version to be {est_specifier},"
             f"{class_name} requires python version to be {est_specifier},"
             f" but system python version is {sys.version}."
         )
@@ -284,6 +386,81 @@ def _check_python_version(obj, package=None, msg=None, severity="error"):
     return True


+def _check_env_marker(obj, package=None, msg=None, severity="error"):
+    """Check if packaging env marker tag of obj is satisfied by the environment.
+
+    Parameters
+    ----------
+    obj : skpro estimator, BaseObject descendant
+        used to check the env_marker tag
+    package : str, default = None
+        if given, will be used in error message as package name
+    msg : str, optional, default = default message (msg below)
+        error message to be returned in the `ModuleNotFoundError`, overrides default
+    severity : str, "error" (default), "warning", or "none"
+        whether the check should raise an error, a warning, or nothing
+
+    Returns
+    -------
+    compatible : bool, whether the environment is compatible with obj
+        check is using the env_marker tag of obj
+
+    Raises
+    ------
+    InvalidMarker
+        User friendly error if obj has an env_marker tag that is not a
+        packaging compatible marker string
+    ModuleNotFoundError
+        User friendly error if obj has an env_marker tag that is
+        incompatible with the python environment. If package is given,
+        error message gives package as the reason for incompatibility.
+    """
+    est_marker_tag = obj.get_class_tag("env_marker", tag_value_default="None")
+    if est_marker_tag in ["None", None]:
+        return True
+
+    try:
+        est_marker = Marker(est_marker_tag)
+    except InvalidMarker:
+        msg_version = (
+            f"wrong format for env_marker tag, "
+            f"must be PEP 508 compatible specifier string, e.g., "
+            f'platform_system!="windows", but found "{est_marker_tag}"'
+        )
+        raise InvalidMarker(msg_version)
+
+    if est_marker.evaluate():
+        return True
+    # now we know that est_marker is not compatible with the environment
+
+    if isclass(obj):
+        class_name = obj.__name__
+    else:
+        class_name = type(obj).__name__
+
+    if not isinstance(msg, str):
+        msg = (
+            f"{class_name} requires an environment to satisfy "
+            f"packaging marker spec {est_marker}, but environment does not satisfy it."
+        )
+
+    if package is not None:
+        msg += f" This is due to requirements of the {package} package."
+
+    if severity == "error":
+        raise ModuleNotFoundError(msg)
+    elif severity == "warning":
+        warnings.warn(msg, stacklevel=2)
+    elif severity == "none":
+        return False
+    else:
+        raise RuntimeError(
+            "Error in calling _check_env_marker, severity "
+            f'argument must be "error", "warning", or "none", found "{severity}".'
+        )
+    return True
+
+
 def _check_estimator_deps(obj, msg=None, severity="error"):
     """Check if object/estimator's package & python requirements are met by python env.
@@ -295,13 +472,13 @@ def _check_estimator_deps(obj, msg=None, severity="error"):

     Parameters
     ----------
     obj : `skpro` object, `BaseObject` descendant, or list/tuple thereof
         object(s) that this function checks compatibility of, with the python env
     msg : str, optional, default = default message (msg below)
         error message to be returned in the `ModuleNotFoundError`, overrides default
     severity : str, "error" (default), "warning", or "none"
         behaviour for raising errors or warnings
-        "error" - raises a ModuleNotFoundException if environment is incompatible
+        "error" - raises a `ModuleNotFoundError` if environment is incompatible
         "warning" - raises a warning if environment is incompatible
             function returns False if environment is incompatible, otherwise True
         "none" - does not raise exception or warning
@@ -334,6 +511,7 @@ def _check_estimator_deps(obj, msg=None, severity="error"):
         return compatible

     compatible = compatible and _check_python_version(obj, severity=severity)
+    compatible = compatible and _check_env_marker(obj, severity=severity)

     pkg_deps = obj.get_class_tag("python_dependencies", None)
     pck_alias = obj.get_class_tag("python_dependencies_alias", None)
@@ -346,3 +524,37 @@ def _check_estimator_deps(obj, msg=None, severity="error"):
         compatible = compatible and pkg_deps_ok

     return compatible
+
+
+def _normalize_requirement(req):
+    """Normalize packaging Requirement by removing build metadata from versions.
+
+    Parameters
+    ----------
+    req : packaging.requirements.Requirement
+        requirement string to normalize, e.g., Requirement("pandas>1.2.3+foobar")
+
+    Returns
+    -------
+    normalized_req : packaging.requirements.Requirement
+        normalized requirement object with build metadata removed from versions,
+        e.g., Requirement("pandas>1.2.3")
+    """
+    # Process each specifier in the requirement
+    normalized_specs = []
+    for spec in req.specifier:
+        # Parse the version and remove the build metadata
+        spec_v = Version(spec.version)
+        version_wo_build_metadata = f"{spec_v.major}.{spec_v.minor}.{spec_v.micro}"
+
+        # Create a new specifier without the build metadata
+        normalized_spec = Specifier(f"{spec.operator}{version_wo_build_metadata}")
+        normalized_specs.append(normalized_spec)
+
+    # Reconstruct the specifier set
+    normalized_specifier_set = SpecifierSet(",".join(str(s) for s in normalized_specs))
+
+    # Create a new Requirement object with the normalized specifiers
+    normalized_req = Requirement(f"{req.name}{normalized_specifier_set}")
+
+    return normalized_req
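Aside (not part of the patch): a behavior sketch for the new `_normalize_requirement` helper, matching the example in its docstring:

```python
# sketch: build metadata is stripped from version specifiers
from packaging.requirements import Requirement

from skpro.utils.validation._dependencies import _normalize_requirement

req = Requirement("pandas>1.2.3+foobar")
print(_normalize_requirement(req))  # expected: pandas>1.2.3
```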