diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e3cb1276b..3bda1e001 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + rev: v4.4.0 hooks: - id: check-added-large-files args: ["--maxkb=1000"] @@ -12,9 +12,18 @@ repos: - id: end-of-file-fixer exclude: "^docs/source/examples/" - id: fix-encoding-pragma + args: + - --remove - id: requirements-txt-fixer - id: trailing-whitespace + - repo: https://github.com/asottile/pyupgrade + rev: v3.10.1 + hooks: + - id: pyupgrade + args: + - --py38-plus + - repo: https://github.com/pycqa/isort rev: 5.12.0 hooks: @@ -22,27 +31,27 @@ repos: name: isort - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 23.7.0 hooks: - id: black language_version: python3 # args: [--line-length 79] - repo: https://github.com/pycqa/flake8 - rev: 4.0.1 + rev: 6.1.0 hooks: - id: flake8 exclude: docs/conf.py additional_dependencies: [flake8-bugbear, flake8-print] - repo: https://github.com/mgedmin/check-manifest - rev: "0.47" + rev: "0.49" hooks: - id: check-manifest stages: [manual] - repo: https://github.com/nbQA-dev/nbQA - rev: 1.2.2 + rev: 1.7.0 hooks: - id: nbqa-black args: [--nbqa-mutate, --nbqa-dont-skip-bad-cells] @@ -55,7 +64,7 @@ repos: additional_dependencies: [flake8==3.8.3] - repo: https://github.com/pycqa/pydocstyle - rev: 6.1.1 + rev: 6.3.0 hooks: - id: pydocstyle args: ["--config=setup.cfg"] @@ -63,7 +72,7 @@ repos: # We use the Python version instead of the original version which seems to require Docker # https://github.com/koalaman/shellcheck-precommit - repo: https://github.com/shellcheck-py/shellcheck-py - rev: v0.8.0.4 + rev: v0.9.0.5 hooks: - id: shellcheck name: shellcheck diff --git a/build_tools/changelog.py b/build_tools/changelog.py index 60f5d837e..7ad12ff0a 100644 --- a/build_tools/changelog.py +++ b/build_tools/changelog.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """RestructuredText changelog generator.""" import os @@ -140,7 +139,6 @@ def render_changelog(prs, assigned): # noqa if __name__ == "__main__": - categories = [ {"title": "Enhancements", "labels": ["feature", "enhancement"]}, {"title": "Fixes", "labels": ["bug", "fix", "bugfix"]}, diff --git a/docs/conf.py b/docs/conf.py index b4174a661..0a88d17fe 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # This file is execfile()d with the current directory set to its containing dir. # diff --git a/examples/parametric/bagging.py b/examples/parametric/bagging.py index e3447f551..982abff8b 100644 --- a/examples/parametric/bagging.py +++ b/examples/parametric/bagging.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor from sklearn.tree import DecisionTreeRegressor diff --git a/examples/parametric/hyperparameters.py b/examples/parametric/hyperparameters.py index e3a4df907..6d9ffb6ab 100644 --- a/examples/parametric/hyperparameters.py +++ b/examples/parametric/hyperparameters.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor from sklearn.datasets.base import load_diabetes from sklearn.ensemble import RandomForestRegressor diff --git a/examples/parametric/simple.py b/examples/parametric/simple.py index 4e0d44c2f..89608ea8d 100644 --- a/examples/parametric/simple.py +++ b/examples/parametric/simple.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor from sklearn.datasets.base import load_diabetes from sklearn.ensemble import RandomForestRegressor diff --git a/examples/parametric/workflow.py b/examples/parametric/workflow.py index d28ddc332..8841ff284 100644 --- a/examples/parametric/workflow.py +++ b/examples/parametric/workflow.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import LinearRegression diff --git a/extension_templates/regression.py b/extension_templates/regression.py index ce924bb4e..dc725a33d 100644 --- a/extension_templates/regression.py +++ b/extension_templates/regression.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Extension template for regressors.""" # todo: write an informative docstring for the file or module, remove the above # todo: add an appropriate copyright notice for your estimator @@ -55,8 +54,8 @@ def __init__(self, paramname, paramname2="paramname2default"): self.paramname = paramname self.paramname2 = "paramname2default" - # todo: change "ClassName" to the name of the class - super(ClassName, self).__init__() + # leave this as is + super().__init__() # todo: optional, parameter checking logic (if applicable) should happen here # if writes derived values to self, should *not* overwrite self.parama etc diff --git a/skpro/__init__.py b/skpro/__init__.py index fe27a6075..86e38242c 100644 --- a/skpro/__init__.py +++ b/skpro/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """skpro.""" __version__ = "2.0.0" diff --git a/skpro/base/__init__.py b/skpro/base/__init__.py index 6b663e9cb..3b6ea7681 100644 --- a/skpro/base/__init__.py +++ b/skpro/base/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Base module with base classes BaseObject, BaseEstimator.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) diff --git a/skpro/base/_base.py b/skpro/base/_base.py index f89dd74f7..c2ddb1ef3 100644 --- a/skpro/base/_base.py +++ b/skpro/base/_base.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Base class and template for regressors and transformers.""" from skbase.base import BaseEstimator as _BaseEstimator from skbase.base import BaseMetaEstimator as _BaseMetaEstimator diff --git a/skpro/base/old_base.py b/skpro/base/old_base.py index da51b5bd4..024338a07 100644 --- a/skpro/base/old_base.py +++ b/skpro/base/old_base.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor import abc import functools diff --git a/skpro/benchmarking/__init__.py b/skpro/benchmarking/__init__.py index 2101040cb..743d08af4 100644 --- a/skpro/benchmarking/__init__.py +++ b/skpro/benchmarking/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- """Benchmarking and evaluation.""" diff --git a/skpro/benchmarking/evaluate.py b/skpro/benchmarking/evaluate.py index 9fbede95b..653eb24d3 100644 --- a/skpro/benchmarking/evaluate.py +++ b/skpro/benchmarking/evaluate.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) """Utility for benchmark evaluation of probabilistic regression models.""" # based on the sktime utility of the same name @@ -18,7 +17,6 @@ def _split(X, y, train, test): - # split data according to cv X_train, X_test = X.iloc[train], X.iloc[test] diff --git a/skpro/benchmarking/tests/__init__.py b/skpro/benchmarking/tests/__init__.py index 7489a03e3..f41c3a5b1 100644 --- a/skpro/benchmarking/tests/__init__.py +++ b/skpro/benchmarking/tests/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- """Tests for benchmarking and evaluation.""" diff --git a/skpro/benchmarking/tests/test_evaluate.py b/skpro/benchmarking/tests/test_evaluate.py index 3b2b9d16b..daf005f7e 100644 --- a/skpro/benchmarking/tests/test_evaluate.py +++ b/skpro/benchmarking/tests/test_evaluate.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Tests for evaluate utility.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) # based on the sktime tests of the same name diff --git a/skpro/datatypes/__init__.py b/skpro/datatypes/__init__.py index 146937608..dadd84e22 100644 --- a/skpro/datatypes/__init__.py +++ b/skpro/datatypes/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Module exports: data type definitions, checks, validation, fixtures, converters.""" # this module has been adapted from sktime # it is largely copy-pasting the Proba and Table parts diff --git a/skpro/datatypes/_check.py b/skpro/datatypes/_check.py index 4d6165ecc..7b2edbbb4 100644 --- a/skpro/datatypes/_check.py +++ b/skpro/datatypes/_check.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) """Machine type checkers for scitypes. @@ -41,7 +40,7 @@ def _check_scitype_valid(scitype: str = None): """Check validity of scitype.""" - valid_scitypes = list(set([x[1] for x in check_dict.keys()])) + valid_scitypes = list({x[1] for x in check_dict.keys()}) if not isinstance(scitype, str): raise TypeError(f"scitype should be a str but found {type(scitype)}") diff --git a/skpro/datatypes/_common.py b/skpro/datatypes/_common.py index b1aa50a81..c6ba0a4d5 100644 --- a/skpro/datatypes/_common.py +++ b/skpro/datatypes/_common.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) """Common private utilities for checkers and converters.""" diff --git a/skpro/datatypes/_convert.py b/skpro/datatypes/_convert.py index 775e2b113..27d920e04 100644 --- a/skpro/datatypes/_convert.py +++ b/skpro/datatypes/_convert.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) """Machine type converters for scitypes. @@ -281,8 +280,8 @@ def _conversions_defined(scitype: str): 0 if conversion from i to j is not defined """ pairs = [(x[0], x[1]) for x in list(convert_dict.keys()) if x[2] == scitype] - cols0 = set([x[0] for x in list(convert_dict.keys()) if x[2] == scitype]) - cols1 = set([x[1] for x in list(convert_dict.keys()) if x[2] == scitype]) + cols0 = {x[0] for x in list(convert_dict.keys()) if x[2] == scitype} + cols1 = {x[1] for x in list(convert_dict.keys()) if x[2] == scitype} cols = sorted(list(cols0.union(cols1))) mat = np.zeros((len(cols), len(cols)), dtype=int) diff --git a/skpro/datatypes/_convert_utils/__init__.py b/skpro/datatypes/_convert_utils/__init__.py index cd5229f93..10c5cf84d 100644 --- a/skpro/datatypes/_convert_utils/__init__.py +++ b/skpro/datatypes/_convert_utils/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- """Conversion auxiliary utilities.""" diff --git a/skpro/datatypes/_convert_utils/_coerce.py b/skpro/datatypes/_convert_utils/_coerce.py index 90fe645d0..70cb6cd72 100644 --- a/skpro/datatypes/_convert_utils/_coerce.py +++ b/skpro/datatypes/_convert_utils/_coerce.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Conercion utilities for mtypes.""" __author__ = ["fkiraly"] @@ -7,7 +6,6 @@ def _is_nullable_numeric(dtype): - return dtype in ["Int64", "Float64", "boolean"] diff --git a/skpro/datatypes/_convert_utils/_convert.py b/skpro/datatypes/_convert_utils/_convert.py index ee6132f90..7e6ac8dbe 100644 --- a/skpro/datatypes/_convert_utils/_convert.py +++ b/skpro/datatypes/_convert_utils/_convert.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Conversion utilities for mtypes.""" __author__ = ["fkiraly"] @@ -64,7 +63,7 @@ def _extend_conversions(mtype, anchor_mtype, convert_dict, mtype_universe=None): scitype = list(keys)[0][2] if mtype_universe is None: - mtype_universe = set([x[1] for x in list(keys)]) + mtype_universe = {x[1] for x in list(keys)} mtype_universe = mtype_universe.union([x[0] for x in list(keys)]) for tp in set(mtype_universe).difference([mtype, anchor_mtype]): diff --git a/skpro/datatypes/_examples.py b/skpro/datatypes/_examples.py index 61a9eddd4..9865210c3 100644 --- a/skpro/datatypes/_examples.py +++ b/skpro/datatypes/_examples.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) """Example fixtures for mtypes/scitypes. diff --git a/skpro/datatypes/_proba/__init__.py b/skpro/datatypes/_proba/__init__.py index 31d3e1d01..1c9d34736 100644 --- a/skpro/datatypes/_proba/__init__.py +++ b/skpro/datatypes/_proba/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Type checkers, converters and mtype inference for probabilistic return types.""" from skpro.datatypes._proba._check import check_dict as check_dict_Proba diff --git a/skpro/datatypes/_proba/_check.py b/skpro/datatypes/_proba/_check.py index dcc584f43..4355706c9 100644 --- a/skpro/datatypes/_proba/_check.py +++ b/skpro/datatypes/_proba/_check.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Machine type checkers for Series scitype. Exports checkers for Series scitype: @@ -48,7 +47,6 @@ def check_pred_quantiles_proba(obj, return_metadata=False, var_name="obj"): - metadata = dict() # check if the input is a dataframe @@ -103,7 +101,6 @@ def check_pred_quantiles_proba(obj, return_metadata=False, var_name="obj"): def check_pred_interval_proba(obj, return_metadata=False, var_name="obj"): - metadata = dict() # check if the input is a dataframe diff --git a/skpro/datatypes/_proba/_convert.py b/skpro/datatypes/_proba/_convert.py index bf39ba598..b45996d26 100644 --- a/skpro/datatypes/_proba/_convert.py +++ b/skpro/datatypes/_proba/_convert.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Machine type converters for Series scitype. Exports conversion and mtype dictionary for Series scitype: @@ -44,7 +43,6 @@ def convert_identity(obj, store=None): - return obj @@ -109,7 +107,6 @@ def convert_pred_interval_to_quantiles(y_pred, inplace=False): def convert_interval_to_quantiles(obj: pd.DataFrame, store=None) -> pd.DataFrame: - return convert_pred_interval_to_quantiles(y_pred=obj) @@ -169,7 +166,6 @@ def convert_pred_quantiles_to_interval(y_pred, inplace=False): def convert_quantiles_to_interval(obj: pd.DataFrame, store=None) -> pd.DataFrame: - return convert_pred_quantiles_to_interval(y_pred=obj) diff --git a/skpro/datatypes/_proba/_examples.py b/skpro/datatypes/_proba/_examples.py index caeb97fd3..a87f18269 100644 --- a/skpro/datatypes/_proba/_examples.py +++ b/skpro/datatypes/_proba/_examples.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Example generation for testing. Exports dict of examples, useful for testing as fixtures. diff --git a/skpro/datatypes/_registry.py b/skpro/datatypes/_registry.py index 6001b9902..36cc454a8 100644 --- a/skpro/datatypes/_registry.py +++ b/skpro/datatypes/_registry.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) """Registry of mtypes and scitypes. diff --git a/skpro/datatypes/_table/__init__.py b/skpro/datatypes/_table/__init__.py index c5027e757..ef620b0d9 100644 --- a/skpro/datatypes/_table/__init__.py +++ b/skpro/datatypes/_table/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Module exports: Series type checkers, converters and mtype inference.""" from skpro.datatypes._table._check import check_dict as check_dict_Table diff --git a/skpro/datatypes/_table/_check.py b/skpro/datatypes/_table/_check.py index 2ac0a6102..049a828f3 100644 --- a/skpro/datatypes/_table/_check.py +++ b/skpro/datatypes/_table/_check.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Machine type checkers for Table scitype. Exports checkers for Table scitype: @@ -49,7 +48,6 @@ def check_pddataframe_table(obj, return_metadata=False, var_name="obj"): - metadata = dict() if not isinstance(obj, pd.DataFrame): @@ -79,7 +77,6 @@ def check_pddataframe_table(obj, return_metadata=False, var_name="obj"): def check_pdseries_table(obj, return_metadata=False, var_name="obj"): - metadata = dict() if not isinstance(obj, pd.Series): @@ -112,7 +109,6 @@ def check_pdseries_table(obj, return_metadata=False, var_name="obj"): def check_numpy1d_table(obj, return_metadata=False, var_name="obj"): - metadata = dict() if not isinstance(obj, np.ndarray): @@ -142,7 +138,6 @@ def check_numpy1d_table(obj, return_metadata=False, var_name="obj"): def check_numpy2d_table(obj, return_metadata=False, var_name="obj"): - metadata = dict() if not isinstance(obj, np.ndarray): @@ -171,7 +166,6 @@ def check_numpy2d_table(obj, return_metadata=False, var_name="obj"): def check_list_of_dict_table(obj, return_metadata=False, var_name="obj"): - metadata = dict() if not isinstance(obj, list): diff --git a/skpro/datatypes/_table/_convert.py b/skpro/datatypes/_table/_convert.py index 6d16d006c..de202a54f 100644 --- a/skpro/datatypes/_table/_convert.py +++ b/skpro/datatypes/_table/_convert.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Machine type converters for Table scitype. Exports conversion and mtype dictionary for Table scitype: @@ -45,7 +44,6 @@ def convert_identity(obj, store=None): - return obj @@ -55,7 +53,6 @@ def convert_identity(obj, store=None): def convert_1D_to_2D_numpy_as_Table(obj: np.ndarray, store=None) -> np.ndarray: - if not isinstance(obj, np.ndarray): raise TypeError("input must be a np.ndarray") @@ -71,7 +68,6 @@ def convert_1D_to_2D_numpy_as_Table(obj: np.ndarray, store=None) -> np.ndarray: def convert_2D_to_1D_numpy_as_Table(obj: np.ndarray, store=None) -> np.ndarray: - if not isinstance(obj, np.ndarray): raise TypeError("input must be a np.ndarray") @@ -87,7 +83,6 @@ def convert_2D_to_1D_numpy_as_Table(obj: np.ndarray, store=None) -> np.ndarray: def convert_df_to_2Dnp_as_Table(obj: pd.DataFrame, store=None) -> np.ndarray: - if not isinstance(obj, pd.DataFrame): raise TypeError("input must be a pd.DataFrame") @@ -101,7 +96,6 @@ def convert_df_to_2Dnp_as_Table(obj: pd.DataFrame, store=None) -> np.ndarray: def convert_df_to_1Dnp_as_Table(obj: pd.DataFrame, store=None) -> np.ndarray: - return convert_df_to_2Dnp_as_Table(obj=obj, store=store).flatten() @@ -109,7 +103,6 @@ def convert_df_to_1Dnp_as_Table(obj: pd.DataFrame, store=None) -> np.ndarray: def convert_2Dnp_to_df_as_Table(obj: np.ndarray, store=None) -> pd.DataFrame: - if not isinstance(obj, np.ndarray) and len(obj.shape) != 2: raise TypeError("input must be a 2D np.ndarray") @@ -132,7 +125,6 @@ def convert_2Dnp_to_df_as_Table(obj: np.ndarray, store=None) -> pd.DataFrame: def convert_1Dnp_to_df_as_Table(obj: np.ndarray, store=None) -> pd.DataFrame: - if not isinstance(obj, np.ndarray) and len(obj.shape) != 1: raise TypeError("input must be a 1D np.ndarray") @@ -154,7 +146,6 @@ def convert_1Dnp_to_df_as_Table(obj: np.ndarray, store=None) -> pd.DataFrame: def convert_s_to_df_as_table(obj: pd.Series, store=None) -> pd.DataFrame: - if not isinstance(obj, pd.Series): raise TypeError("input must be a pd.Series") @@ -176,7 +167,6 @@ def convert_s_to_df_as_table(obj: pd.Series, store=None) -> pd.DataFrame: def convert_df_to_s_as_table(obj: pd.DataFrame, store=None) -> pd.Series: - if not isinstance(obj, pd.DataFrame): raise TypeError("input is not a pd.DataFrame") @@ -198,7 +188,6 @@ def convert_df_to_s_as_table(obj: pd.DataFrame, store=None) -> pd.Series: def convert_list_of_dict_to_df_as_table(obj: list, store=None) -> pd.DataFrame: - if not isinstance(obj, list): raise TypeError("input must be a list of dict") @@ -223,7 +212,6 @@ def convert_list_of_dict_to_df_as_table(obj: list, store=None) -> pd.DataFrame: def convert_df_to_list_of_dict_as_table(obj: pd.DataFrame, store=None) -> list: - if not isinstance(obj, pd.DataFrame): raise TypeError("input is not a pd.DataFrame") diff --git a/skpro/datatypes/_table/_examples.py b/skpro/datatypes/_table/_examples.py index 851411b5e..f06385211 100644 --- a/skpro/datatypes/_table/_examples.py +++ b/skpro/datatypes/_table/_examples.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Example generation for testing. Exports dict of examples, useful for testing as fixtures. diff --git a/skpro/datatypes/tests/__init__.py b/skpro/datatypes/tests/__init__.py index fa1dd4743..76ca2253a 100644 --- a/skpro/datatypes/tests/__init__.py +++ b/skpro/datatypes/tests/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- """Tests for data types module.""" diff --git a/skpro/datatypes/tests/test_check.py b/skpro/datatypes/tests/test_check.py index 3477b21a0..56fbda87e 100644 --- a/skpro/datatypes/tests/test_check.py +++ b/skpro/datatypes/tests/test_check.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Testing machine type checkers for scitypes.""" __author__ = ["fkiraly"] @@ -37,7 +36,6 @@ def _generate_scitype_mtype_combinations(): sci_mtype_tuples = [] for scitype in SCITYPES: - mtypes = scitype_to_mtype(scitype) for mtype in mtypes: @@ -85,7 +83,7 @@ def pytest_generate_tests(metafunc): fixturenames = set(metafunc.fixturenames) - if set(["scitype", "mtype", "fixture_index"]).issubset(fixturenames): + if {"scitype", "mtype", "fixture_index"}.issubset(fixturenames): keys = _generate_scitype_mtype_fixtureindex_combinations() ids = [] @@ -95,7 +93,7 @@ def pytest_generate_tests(metafunc): # parameterize test with from-mtpes metafunc.parametrize("scitype,mtype,fixture_index", keys, ids=ids) - elif set(["scitype", "mtype"]).issubset(fixturenames): + elif {"scitype", "mtype"}.issubset(fixturenames): keys = _generate_scitype_mtype_combinations() ids = [] @@ -310,8 +308,7 @@ def test_check_negative(scitype, mtype): for i in range(n_fixtures): # if mtype is not ambiguous, other mtypes are negative examples - for wrong_mtype in list(set(mtypes).difference(set([mtype]))): - + for wrong_mtype in list(set(mtypes).difference({mtype})): # retrieve fixture for checking fixture_wrong_type = fixtures[wrong_mtype].get(i) diff --git a/skpro/datatypes/tests/test_convert.py b/skpro/datatypes/tests/test_convert.py index ae796dc4f..4e4729ece 100644 --- a/skpro/datatypes/tests/test_convert.py +++ b/skpro/datatypes/tests/test_convert.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Testing machine type converters for scitypes.""" __author__ = ["fkiraly"] @@ -22,7 +21,6 @@ def _generate_fixture_tuples(): fixture_tuples = [] for scitype in SCITYPES: - # if we know there are no conversions defined, skip this scitype if scitype in SCITYPES_NO_CONVERSIONS: continue @@ -113,7 +111,6 @@ def test_convert(scitype, from_mtype, to_mtype, fixture_index): # test that converted from-fixture equals to-fixture if cond1 and cond2 and cond3: - converted_fixture_i = convert( obj=from_fixture[0], from_type=from_mtype, diff --git a/skpro/datatypes/tests/test_convert_to.py b/skpro/datatypes/tests/test_convert_to.py index 1e3cef332..4b37fd6f7 100644 --- a/skpro/datatypes/tests/test_convert_to.py +++ b/skpro/datatypes/tests/test_convert_to.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Testing machine type converters for scitypes - covert_to utility.""" __author__ = ["fkiraly"] diff --git a/skpro/datatypes/tests/test_lookup.py b/skpro/datatypes/tests/test_lookup.py index 8f91021f1..b3283fffc 100644 --- a/skpro/datatypes/tests/test_lookup.py +++ b/skpro/datatypes/tests/test_lookup.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Testing mtype/scitypes lookup.""" __author__ = ["fkiraly"] diff --git a/skpro/distributions/__init__.py b/skpro/distributions/__init__.py index d559c42f1..bec9eb78f 100644 --- a/skpro/distributions/__init__.py +++ b/skpro/distributions/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Probability distribution objects.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) # adapted from sktime diff --git a/skpro/distributions/base.py b/skpro/distributions/base.py index 82e1014c7..3da06353d 100644 --- a/skpro/distributions/base.py +++ b/skpro/distributions/base.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) """Base classes for probability distribution objects.""" @@ -33,11 +32,10 @@ class BaseDistribution(BaseObject): } def __init__(self, index=None, columns=None): - self.index = index self.columns = columns - super(BaseDistribution, self).__init__() + super().__init__() _check_estimator_deps(self) @property @@ -89,7 +87,6 @@ def _loc(self, rowidx=None, colidx=None): return self._iloc(rowidx=row_iloc, colidx=col_iloc) def _subset_params(self, rowidx, colidx): - params = self._get_dist_params() subset_param_dict = {} @@ -131,7 +128,6 @@ def subset_not_none(idx, subs): ) def _get_dist_params(self): - params = self.get_params(deep=False) paramnames = params.keys() reserved_names = ["index", "columns"] @@ -585,13 +581,11 @@ class _BaseTFDistribution(BaseDistribution): } def __init__(self, index=None, columns=None, distr=None): - self.distr = distr - super(_BaseTFDistribution, self).__init__(index=index, columns=columns) + super().__init__(index=index, columns=columns) def __str__(self): - return self.to_str() def pdf(self, x): diff --git a/skpro/distributions/empirical.py b/skpro/distributions/empirical.py index 30ecf0236..f69a91393 100644 --- a/skpro/distributions/empirical.py +++ b/skpro/distributions/empirical.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) """Empirical distribution.""" @@ -125,11 +124,10 @@ def _apply_per_ix(self, func, params, x=None): x_t = x.loc[ix, col] else: x_t = x - res.loc[ix, col] = func(spl=spl_t, weights=weights_t, x=x_t, **params) - return res.convert_dtypes() + res.at[ix, col] = func(spl=spl_t, weights=weights_t, x=x_t, **params) + return res.apply(pd.to_numeric) def _iloc(self, rowidx=None, colidx=None): - index = self.index columns = self.columns weights = self.weights diff --git a/skpro/distributions/laplace.py b/skpro/distributions/laplace.py index 957a9745e..5cfaa4cf2 100644 --- a/skpro/distributions/laplace.py +++ b/skpro/distributions/laplace.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) """Laplace probability distribution.""" @@ -36,7 +35,6 @@ class Laplace(BaseDistribution): } def __init__(self, mu, scale, index=None, columns=None): - self.mu = mu self.scale = scale self.index = index diff --git a/skpro/distributions/mixture.py b/skpro/distributions/mixture.py index a822fb106..1b1ca159c 100644 --- a/skpro/distributions/mixture.py +++ b/skpro/distributions/mixture.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) """Mixture distribution.""" @@ -43,7 +42,6 @@ class Mixture(BaseMetaObject, BaseDistribution): } def __init__(self, distributions, weights=None, index=None, columns=None): - self.distributions = distributions self.weights = weights self.index = index @@ -66,7 +64,6 @@ def __init__(self, distributions, weights=None, index=None, columns=None): super().__init__(index=index, columns=columns) def _iloc(self, rowidx=None, colidx=None): - dists = self._distributions weights = self.weights diff --git a/skpro/distributions/normal.py b/skpro/distributions/normal.py index 4d761598c..470df7ab9 100644 --- a/skpro/distributions/normal.py +++ b/skpro/distributions/normal.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) """Normal/Gaussian probability distribution.""" @@ -37,7 +36,6 @@ class Normal(BaseDistribution): } def __init__(self, mu, sigma, index=None, columns=None): - self.mu = mu self.sigma = sigma self.index = index @@ -55,7 +53,7 @@ def __init__(self, mu, sigma, index=None, columns=None): if columns is None: columns = pd.RangeIndex(shape[1]) - super(Normal, self).__init__(index=index, columns=columns) + super().__init__(index=index, columns=columns) def energy(self, x=None): r"""Energy of self, w.r.t. self or a constant frame x. diff --git a/skpro/distributions/t.py b/skpro/distributions/t.py index d82aac054..98fd8a3ce 100644 --- a/skpro/distributions/t.py +++ b/skpro/distributions/t.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Student's t-distribution.""" # copyright: skprodevelopers, BSD-3-Clause License (see LICENSE file) @@ -16,9 +15,9 @@ class TDistribution(BaseDistribution): Parameters ---------- - mean : float or array of float (1D or 2D) + mu : float or array of float (1D or 2D) mean of the t-distribution distribution - sd : float or array of float (1D or 2D), must be positive + sigma : float or array of float (1D or 2D), must be positive standard deviation of the t-distribution distribution df : float or array of float (1D or 2D), must be positive Degrees of freedom of the t-distribution diff --git a/skpro/distributions/tests/__init__.py b/skpro/distributions/tests/__init__.py index f6a30a95c..2a43a676e 100644 --- a/skpro/distributions/tests/__init__.py +++ b/skpro/distributions/tests/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- """Tests for skpro probability distribution objects.""" diff --git a/skpro/distributions/tests/test_all_distrs.py b/skpro/distributions/tests/test_all_distrs.py index 40b2694ce..466ada417 100644 --- a/skpro/distributions/tests/test_all_distrs.py +++ b/skpro/distributions/tests/test_all_distrs.py @@ -1,9 +1,8 @@ -# -*- coding: utf-8 -*- """Tests for BaseDistribution API points.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) # adapted from sktime -__author__ = ["fkiraly"] +__author__ = ["fkiraly", "Alex-JG3"] import numpy as np import pandas as pd @@ -185,6 +184,29 @@ def test_subsetting(self, object_instance, subset_row, subset_col): assert (res_iloc.index == ix_loc).all() assert (res_iloc.columns == iy_loc).all() + def test_log_pdf_and_pdf(self, object_instance): + """Test that the log of the pdf and log_pdf function are similar.""" + d = object_instance + capabilities_exact = d.get_tags()["capabilities:exact"] + + if "log_pdf" not in capabilities_exact or "pdf" not in capabilities_exact: + return + x = d.sample() + pdf = d.pdf(x) + log_pdf = d.log_pdf(x) + assert np.allclose(np.log(pdf), log_pdf) + + def test_ppf_and_cdf(self, object_instance): + """Test that the ppf is the inverse of the cdf.""" + d = object_instance + capabilities_exact = d.get_tags()["capabilities:exact"] + + if "ppf" not in capabilities_exact or "cdf" not in capabilities_exact: + return + x = d.sample() + x_approx = d.ppf(d.cdf(x)) + assert np.allclose(x.values, x_approx.values) + def _check_output_format(res, dist, method): """Check output format expectations for BaseDistribution tests.""" @@ -200,6 +222,13 @@ def _check_output_format(res, dist, method): if method in METHODS_SCALAR_POS or method in METHODS_X_POS: assert (res >= 0).all().all() + if isinstance(res, pd.DataFrame): + assert res.apply(pd.api.types.is_numeric_dtype).all() + elif isinstance(res, pd.Series): + assert pd.api.types.is_numeric_dtype(res) + else: + raise TypeError("res must be a pandas DataFrame or Series.") + def _shuffle_distr(d): """Shuffle distribution row index.""" diff --git a/skpro/distributions/tests/test_base_default_methods.py b/skpro/distributions/tests/test_base_default_methods.py index 862ad359e..2e6814c15 100644 --- a/skpro/distributions/tests/test_base_default_methods.py +++ b/skpro/distributions/tests/test_base_default_methods.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Test class for default methods. This is not for direct use, but for testing whether the defaulting in various @@ -31,7 +30,6 @@ class _DistrDefaultMethodTester(BaseDistribution): } def __init__(self, mu, sigma, index=None, columns=None): - self.mu = mu self.sigma = sigma self.index = index @@ -46,7 +44,7 @@ def __init__(self, mu, sigma, index=None, columns=None): if columns is None: columns = pd.RangeIndex(shape[1]) - super(_DistrDefaultMethodTester, self).__init__(index=index, columns=columns) + super().__init__(index=index, columns=columns) def ppf(self, p): """Quantile function = percent point function = inverse cdf.""" diff --git a/skpro/distributions/tests/test_proba_basic.py b/skpro/distributions/tests/test_proba_basic.py index f1755b2a8..6dc4ec31c 100644 --- a/skpro/distributions/tests/test_proba_basic.py +++ b/skpro/distributions/tests/test_proba_basic.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Non-suite tests for probability distribution objects.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) # adapted from sktime diff --git a/skpro/metrics/__init__.py b/skpro/metrics/__init__.py index aedbc1f7f..21f0f1d40 100644 --- a/skpro/metrics/__init__.py +++ b/skpro/metrics/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Metrics for probabilistic supervised regression.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) # adapted from sktime diff --git a/skpro/metrics/_classes.py b/skpro/metrics/_classes.py index b4c8a5129..86dca9da8 100644 --- a/skpro/metrics/_classes.py +++ b/skpro/metrics/_classes.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Concrete performance metrics for probabilistic supervised regression.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) # adapted from sktime diff --git a/skpro/metrics/_coerce.py b/skpro/metrics/_coerce.py index c035aed7d..7f3e1dde6 100644 --- a/skpro/metrics/_coerce.py +++ b/skpro/metrics/_coerce.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Output coercion utilities for metric classes.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) # adapted from sktime diff --git a/skpro/metrics/base.py b/skpro/metrics/base.py index 76629d2ae..facdc51c3 100644 --- a/skpro/metrics/base.py +++ b/skpro/metrics/base.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Base classes for probabilistic metrics.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) # adapted from sktime @@ -44,7 +43,7 @@ class BaseProbaMetric(BaseObject): def __init__(self, multioutput="uniform_average", score_average=True): self.multioutput = multioutput self.score_average = score_average - super(BaseProbaMetric, self).__init__() + super().__init__() def __call__(self, y_true, y_pred, **kwargs): """Calculate metric value using underlying metric function. @@ -317,7 +316,7 @@ def _check_ys(self, y_true, y_pred, multioutput): def _get_alpha_from(self, y_pred): """Fetch the alphas present in y_pred.""" alphas = np.unique(list(y_pred.columns.get_level_values(1))) - if not all(((alphas > 0) & (alphas < 1))): + if not all((alphas > 0) & (alphas < 1)): raise ValueError("Alpha must be between 0 and 1.") return alphas @@ -333,7 +332,7 @@ def _check_alpha(self, alpha): if not isinstance(alpha, np.ndarray): alpha = np.asarray(alpha) - if not all(((alpha > 0) & (alpha < 1))): + if not all((alpha > 0) & (alpha < 1)): raise ValueError("Alpha must be between 0 and 1.") return alpha diff --git a/skpro/metrics/tests/__init__.py b/skpro/metrics/tests/__init__.py index c7e85f120..089516370 100644 --- a/skpro/metrics/tests/__init__.py +++ b/skpro/metrics/tests/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- """Tests for probabilistic metrics.""" diff --git a/skpro/metrics/tests/test_distr_metrics.py b/skpro/metrics/tests/test_distr_metrics.py index 3d6229852..94f882650 100644 --- a/skpro/metrics/tests/test_distr_metrics.py +++ b/skpro/metrics/tests/test_distr_metrics.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Tests for probabilistic metrics for distribution predictions.""" import warnings diff --git a/skpro/metrics/tests/test_probabilistic_metrics.py b/skpro/metrics/tests/test_probabilistic_metrics.py index 7ddc034c1..94c5f851c 100644 --- a/skpro/metrics/tests/test_probabilistic_metrics.py +++ b/skpro/metrics/tests/test_probabilistic_metrics.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Tests for probabilistic quantile and interval metrics.""" import pandas as pd import pytest diff --git a/skpro/model_selection/__init__.py b/skpro/model_selection/__init__.py index 92a0bc5ff..262fe9228 100644 --- a/skpro/model_selection/__init__.py +++ b/skpro/model_selection/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Tuning and model selection.""" __all__ = ["GridSearchCV", "RandomizedSearchCV"] diff --git a/skpro/model_selection/_tuning.py b/skpro/model_selection/_tuning.py index 2bff11672..bccfca91f 100644 --- a/skpro/model_selection/_tuning.py +++ b/skpro/model_selection/_tuning.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) """Tuning of probabilistic supervised regressors.""" diff --git a/skpro/registry/__init__.py b/skpro/registry/__init__.py index bf54ebc12..f6d35ed64 100644 --- a/skpro/registry/__init__.py +++ b/skpro/registry/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Registry and lookup functionality.""" from skpro.registry._lookup import all_objects diff --git a/skpro/registry/_lookup.py b/skpro/registry/_lookup.py index 52e3db856..db42e6430 100644 --- a/skpro/registry/_lookup.py +++ b/skpro/registry/_lookup.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Registry lookup methods. This module exports the following methods for registry lookup: @@ -23,7 +22,7 @@ from skpro.base import BaseEstimator, BaseObject from skpro.registry._tags import OBJECT_TAG_REGISTER -VALID_OBJECT_TYPE_STRINGS = set([x[1] for x in OBJECT_TAG_REGISTER]) +VALID_OBJECT_TYPE_STRINGS = {x[1] for x in OBJECT_TAG_REGISTER} def all_objects( diff --git a/skpro/registry/_tags.py b/skpro/registry/_tags.py index 9b1f56081..b4cfa5766 100644 --- a/skpro/registry/_tags.py +++ b/skpro/registry/_tags.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Register of estimator and object tags. Note for extenders: new tags should be entered in OBJECT_TAG_REGISTER. diff --git a/skpro/registry/tests/__init__.py b/skpro/registry/tests/__init__.py index 565629c22..f331499ed 100644 --- a/skpro/registry/tests/__init__.py +++ b/skpro/registry/tests/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- """Tests for registry and lookup functionality.""" diff --git a/skpro/registry/tests/test_tags.py b/skpro/registry/tests/test_tags.py index 109e2255b..45c79ac62 100644 --- a/skpro/registry/tests/test_tags.py +++ b/skpro/registry/tests/test_tags.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Tests for tag register an tag functionality.""" from skpro.registry._tags import OBJECT_TAG_REGISTER diff --git a/skpro/regression/__init__.py b/skpro/regression/__init__.py index 6e6efae71..00361778e 100644 --- a/skpro/regression/__init__.py +++ b/skpro/regression/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- """Probabilitistic supervised regression estimators.""" diff --git a/skpro/regression/base/__init__.py b/skpro/regression/base/__init__.py index 44976f0d5..ddfc8d0b7 100644 --- a/skpro/regression/base/__init__.py +++ b/skpro/regression/base/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Base classes for probabilistic regression.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) diff --git a/skpro/regression/base/_base.py b/skpro/regression/base/_base.py index ebb062e5a..6ca02809d 100644 --- a/skpro/regression/base/_base.py +++ b/skpro/regression/base/_base.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Base class for probabilistic regression.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) @@ -20,11 +19,10 @@ class BaseProbaRegressor(BaseEstimator): } def __init__(self, index=None, columns=None): - self.index = index self.columns = columns - super(BaseProbaRegressor, self).__init__() + super().__init__() _check_estimator_deps(self) def __rmul__(self, other): @@ -392,7 +390,6 @@ def _predict_quantiles(self, X, alpha): raise NotImplementedError if implements_interval: - pred_int = pd.DataFrame() for a in alpha: # compute quantiles corresponding to prediction interval coverage @@ -424,7 +421,6 @@ def _predict_quantiles(self, X, alpha): pred_int.columns = int_idx elif implements_proba: - pred_proba = self.predict_proba(X=X) pred_int = pred_proba.quantile(alpha=alpha) @@ -522,7 +518,6 @@ def _predict_var(self, X): return pred_var def _check_X_y(self, X, y): - X = self._check_X(X) y = self._check_y(y) diff --git a/skpro/regression/base/_delegate.py b/skpro/regression/base/_delegate.py index 036e12b77..622bc5cf5 100644 --- a/skpro/regression/base/_delegate.py +++ b/skpro/regression/base/_delegate.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Delegator mixin that delegates all methods to wrapped regressor. Useful for building estimators where all but one or a few methods are delegated. For diff --git a/skpro/regression/bootstrap.py b/skpro/regression/bootstrap.py index 130d8cd3e..cb0b3df3b 100644 --- a/skpro/regression/bootstrap.py +++ b/skpro/regression/bootstrap.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Probabilistic regression by bootstrap.""" __author__ = ["fkiraly"] @@ -26,7 +25,7 @@ class BootstrapRegressor(BaseProbaRegressor): and whether sampling is with or without replacement. Direct generalization of ``sklearn``'s ``BaggingClassifier`` - to the probabilistic regrsesion task. + to the probabilistic regression task. Parameters ---------- diff --git a/skpro/regression/compose/__init__.py b/skpro/regression/compose/__init__.py index cb95d4a9d..1d23be43d 100644 --- a/skpro/regression/compose/__init__.py +++ b/skpro/regression/compose/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Composition and pipelines for probabilistic supervised regression.""" from skpro.regression.compose._pipeline import Pipeline diff --git a/skpro/regression/compose/_pipeline.py b/skpro/regression/compose/_pipeline.py index cfdfdb55c..c4e9672d7 100644 --- a/skpro/regression/compose/_pipeline.py +++ b/skpro/regression/compose/_pipeline.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Implements pipelines for probabilistic supervised regression.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) # based on sktime pipelines diff --git a/skpro/regression/density.py b/skpro/regression/density.py index fd2b4a2a4..6383db0bf 100644 --- a/skpro/regression/density.py +++ b/skpro/regression/density.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor import abc diff --git a/skpro/regression/ensemble.py b/skpro/regression/ensemble.py index 71c6c48cd..3683bec19 100644 --- a/skpro/regression/ensemble.py +++ b/skpro/regression/ensemble.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor import numpy as np diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py index 72f3552ad..6d41594e6 100644 --- a/skpro/regression/residual.py +++ b/skpro/regression/residual.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Residual regression - one regressor for mean, one for scale.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) @@ -8,41 +7,97 @@ import pandas as pd from sklearn import clone -from skpro.distributions.normal import Normal from skpro.regression.base import BaseProbaRegressor class ResidualDouble(BaseProbaRegressor): """Residual double regressor. - One regressor predicting the mean, and one the deviation from the mean. + Make a parametric probabilistic prediction using two tabular regressors, with + one tabular regressor predicting the mean, and one the deviation from the mean. - TODO - math description + The mean is predicted by ``estimator``. The residual is predicted by + ``estimator_resid``. The residual is transformed by ``residual_trafo``. + The predicted mean and residual are passed to a distribution specified by + ``distr_type``, and possibly ``distr_params``, ``distr_loc_scale_name``. + + The residuals predicted on the training data are used to fit + ``estimator_resid``. If ``cv`` is passed, the residuals are out-of-sample + according to ``cv``, otherwise in-sample. + + ``use_y_pred`` determines whether the predicted mean is used as a feature + in predicting the residual. + + A formal description of the algorithm follows. + + In ``fit``, given training data ``X``, ``y``: + + 1. Fit clone ``estimator_`` of ``estimator`` to predict ``y`` from ``X``, + i.e., ``fit(X, y)``. + 2. Predict mean label ``y_pred`` for ``X`` using a clone of ``estimator``. + If ``cv`` is ``None``, this is via plain ``estimator.predict(X)``. + If ``cv`` is not ``None``, out-of-sample predictions are obtained via ``cv``. + In this case, indices not appearing in ``cv`` are predicted in-sample. + 3. Compute residual ``resid`` as ``residual_trafo(y - y_pred)``. + If ``residual_trafo`` is a transformer, ``residual_trafo.fit_transform`` is used. + 4. Fit clone ``estimator_resid_`` of ``estimator_resid`` + to predict ``resid`` from ``X``, i.e., ``fit(X, resid)``. + If ``use_y_pred`` is ``True``, ``y_pred`` is used as a feature in predicting. + + In ``predict``, given test data ``X``: + + 1. Predict mean label ``y_pred`` for ``X`` using ``estimator_.predict(X)``. + 2. Return ``y_pred``. + + In ``predict_proba``, given test data ``X``: + + 1. Predict mean label ``y_pred`` for ``X`` using ``estimator_.predict(X)``. + 2. Predict residual ``resid`` for ``X`` using ``estimator_resid_.predict(X)``. + If ``use_y_pred`` is ``True``, ``y_pred`` is used as a feature in predicting. + 3. Predict distribution ``y_pred_proba`` for ``X`` as follows: + The location parameter is ``y_pred``. The scale parameter is ``resid``. + Further parameters can be specified via ``distr_params``. + 4. Return ``y_pred_proba``. Parameters ---------- - estimator : skpro estimator, BaseProbaRegressor descendant + estimator : sklearn regressor estimator predicting the mean or location - estimator_resid : skpro estimator, BaseProbaRegressor descendant, optional + estimator_resid : sklearn regressor estimator predicting the scale of the residual default = sklearn DummyRegressor(strategy="mean") - - TODO - add - estimator_resid : skpro estimator or dict of estimators with str keys residual_trafo : str, or transformer, default="absolute" determines the labels predicted by ``estimator_resid`` absolute = absolute residuals squared = squared residuals + if transformer, applies fit_transform to batch of signed residuals distr_type : str or BaseDistribution, default = "Normal" type of distribution to predict str options are "Normal", "Laplace", "Cauchy", "t" + distr_loc_scale_name : tuple of length two, default = ("loc", "scale") + names of the parameters in the distribution to use for location and scale + if ``distr_type`` is a string, this is overridden to the correct parameters + if ``distr_type`` is a BaseDistribution, this is used to determine the + location and scale parameters that the predictions are passed to + distr_params : dict, default = {} + parameters to pass to the distribution + must be valid parameters of ``distr_type``, if ``BaseDistribution`` + must be default or dict with key ``df``, if ``t`` distribution use_y_pred : bool, default=False whether to use the predicted location in predicting the scale of the residual cv : optional, sklearn cv splitter, default = None if passed, will be used to obtain out-of-sample residuals according to cv instead of in-sample residuals in ``fit`` of this estimator min_scale : float, default=1e-10 - minimum scale parameter if ``estimator_resid`` is an estimator (not dict) + minimum scale parameter. If smaller scale parameter is predicted by + ``estimator_resid``, will be clipped to this value + + Attributes + ---------- + estimator_ : sklearn regressor, clone of ``estimator`` + fitted estimator predicting the mean or location + estimator_resid_ : sklearn regressor, clone of ``estimator_resid`` + fitted estimator predicting the scale of the residual Example ------- @@ -64,13 +119,29 @@ class ResidualDouble(BaseProbaRegressor): _tags = {"capability:missing": True} - def __init__(self, estimator, estimator_resid=None, min_scale=1e-10): - + def __init__( + self, + estimator, + estimator_resid=None, + residual_trafo="absolute", + distr_type="Normal", + distr_loc_scale_name=None, + distr_params=None, + use_y_pred=False, + cv=None, + min_scale=1e-10, + ): self.estimator = estimator self.estimator_resid = estimator_resid + self.residual_trafo = residual_trafo + self.distr_type = distr_type + self.distr_loc_scale_name = distr_loc_scale_name + self.distr_params = distr_params + self.use_y_pred = use_y_pred + self.cv = cv self.min_scale = min_scale - super(ResidualDouble, self).__init__() + super().__init__() self.estimator_ = clone(estimator) @@ -81,6 +152,36 @@ def __init__(self, estimator, estimator_resid=None, min_scale=1e-10): else: self.estimator_resid_ = clone(estimator_resid) + def _predict_residuals_cv(self, X, y, cv, est): + """Predict out-of-sample residuals for y from X using cv. + + Parameters + ---------- + X : pandas DataFrame + feature instances to fit regressor to + y : pandas DataFrame, must be same length as X + labels to fit regressor to + cv : sklearn cv splitter + cv splitter to use for out-of-sample predictions + + Returns + ------- + y_pred : pandas DataFrame, same length as `X`, same columns as `y` in `fit` + labels predicted for `X` + """ + est = self.estimator_resid + method = "predict" + y_pred = y.copy() + + for tr_idx, tt_idx in cv.split(X): + X_train = X.iloc[tr_idx] + X_test = X.iloc[tt_idx] + y_train = y[tr_idx] + fitted_est = clone(est).fit(X_train, y_train) + y_pred[tt_idx] = getattr(fitted_est, method)(X_test) + + return y_pred + def _fit(self, X, y): """Fit regressor to training data. @@ -100,16 +201,36 @@ def _fit(self, X, y): """ est = self.estimator_ est_r = self.estimator_resid_ + residual_trafo = self.residual_trafo + cv = self.cv + use_y_pred = self.use_y_pred self._y_cols = y.columns y = y.values.flatten() est.fit(X, y) - resids = np.abs(y - est.predict(X)) + + if cv is None: + y_pred = est.predict(X) + else: + y_pred = self._predict_residuals_cv(X, y, cv, est) + + if residual_trafo == "absolute": + resids = np.abs(y - y_pred) + elif residual_trafo == "squared": + resids = (y - y_pred) ** 2 + else: + resids = residual_trafo.fit_transform(y - y_pred) resids = resids.flatten() - est_r.fit(X, resids) + if use_y_pred: + y_ix = {"index": X.index, "columns": self._y_cols} + X_r = pd.concat([X, pd.DataFrame(y_pred, **y_ix)], axis=1) + else: + X_r = X + + est_r.fit(X_r, resids) return self @@ -160,19 +281,65 @@ def _predict_proba(self, X): """ est = self.estimator_ est_r = self.estimator_resid_ + use_y_pred = self.use_y_pred + distr_type = self.distr_type + distr_loc_scale_name = self.distr_loc_scale_name + distr_params = self.distr_params min_scale = self.min_scale + if distr_params is None: + distr_params = {} + + # predict location - this is the same as in _predict y_pred_loc = est.predict(X) y_pred_loc = y_pred_loc.reshape(-1, 1) - y_pred_scale = est_r.predict(X) + # predict scale + # if use_y_pred, use predicted location as feature + if use_y_pred: + y_ix = {"index": X.index, "columns": self._y_cols} + X_r = pd.concat([X, pd.DataFrame(y_pred_loc, **y_ix)], axis=1) + # if not use_y_pred, use only original features + else: + X_r = X + + y_pred_scale = est_r.predict(X_r) y_pred_scale = y_pred_scale.clip(min=min_scale) y_pred_scale = y_pred_scale.reshape(-1, 1) - y_pred = Normal( - mu=y_pred_loc, sigma=y_pred_scale, index=X.index, columns=self._y_cols - ) + # create distribution with predicted scale and location + # we deal with string distr_types by getting class and param names + if distr_type == "Normal": + from skpro.distributions.normal import Normal + + distr_type = Normal + distr_loc_scale_name = ("mu", "sigma") + elif distr_type == "Laplace": + from skpro.distributions.laplace import Laplace + + distr_type = Laplace + distr_loc_scale_name = ("mu", "scale") + elif distr_type in ["Cauchy", "t"]: + from skpro.distributions.t import TDistribution + + distr_type = TDistribution + distr_loc_scale_name = ("mu", "sigma") + + # collate all parameters for the distribution constructor + # distribution params, if passed + params = distr_params + # row/column index + ix = {"index": X.index, "columns": self._y_cols} + params.update(ix) + # location and scale + loc_scale = { + distr_loc_scale_name[0]: y_pred_loc, + distr_loc_scale_name[1]: y_pred_scale, + } + params.update(loc_scale) + # create distribution and return + y_pred = distr_type(**params) return y_pred @classmethod @@ -195,12 +362,25 @@ def get_test_params(cls, parameter_set="default"): """ from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import LinearRegression + from sklearn.model_selection import KFold params1 = {"estimator": RandomForestRegressor()} params2 = { "estimator": LinearRegression(), "estimator_resid": RandomForestRegressor(), "min_scale": 1e-7, + "residual_trafo": "squared", + "use_y_pred": True, + "distr_type": "Laplace", + } + params3 = { + "estimator": LinearRegression(), + "estimator_resid": RandomForestRegressor(), + "min_scale": 1e-6, + "use_y_pred": True, + "distr_type": "t", + "distr_params": {"df": 3}, + "cv": KFold(n_splits=3), } - return [params1, params2] + return [params1, params2, params3] diff --git a/skpro/regression/tests/__init__.py b/skpro/regression/tests/__init__.py index e108a35d7..19626df61 100644 --- a/skpro/regression/tests/__init__.py +++ b/skpro/regression/tests/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- """Tests for probabilistic supervised regressors.""" diff --git a/skpro/regression/tests/test_all_regressors.py b/skpro/regression/tests/test_all_regressors.py index 8b831d1a7..81b78118c 100644 --- a/skpro/regression/tests/test_all_regressors.py +++ b/skpro/regression/tests/test_all_regressors.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Automated tests based on the skbase test suite template.""" import pandas as pd import pytest diff --git a/skpro/tests/__init__.py b/skpro/tests/__init__.py index f0395b086..f17b879f3 100644 --- a/skpro/tests/__init__.py +++ b/skpro/tests/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- """Tests for skpro package.""" diff --git a/skpro/tests/conftest.py b/skpro/tests/conftest.py deleted file mode 100644 index 64e794d8e..000000000 --- a/skpro/tests/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -""" - Dummy conftest.py for skpro. - - If you don't know what this is for, just leave it empty. - Read more about conftest.py under: - https://pytest.org/latest/plugins.html -""" -from __future__ import absolute_import, division, print_function - -import pytest diff --git a/skpro/tests/test_all_estimators.py b/skpro/tests/test_all_estimators.py index 842baa2a0..7ae206337 100644 --- a/skpro/tests/test_all_estimators.py +++ b/skpro/tests/test_all_estimators.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Automated tests based on the skbase test suite template.""" import numbers import types diff --git a/skpro/tests/test_base.py b/skpro/tests/test_base.py index 223295fe6..a1498f792 100644 --- a/skpro/tests/test_base.py +++ b/skpro/tests/test_base.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor import numpy as np diff --git a/skpro/tests/test_baselines.py b/skpro/tests/test_baselines.py index a828b5fb7..4ecc1e37d 100644 --- a/skpro/tests/test_baselines.py +++ b/skpro/tests/test_baselines.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Legacy module: test baselines.""" # LEGACY MODULE - TODO: remove or refactor diff --git a/skpro/tests/test_density.py b/skpro/tests/test_density.py index 6844e27f1..5628a120d 100644 --- a/skpro/tests/test_density.py +++ b/skpro/tests/test_density.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor if False: diff --git a/skpro/tests/test_ensemble.py b/skpro/tests/test_ensemble.py index 00c90e468..b6222b1c8 100644 --- a/skpro/tests/test_ensemble.py +++ b/skpro/tests/test_ensemble.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor import pytest diff --git a/skpro/tests/test_parametric.py b/skpro/tests/test_parametric.py index 37d4bcff8..071fcdbf1 100644 --- a/skpro/tests/test_parametric.py +++ b/skpro/tests/test_parametric.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Legacy module: test parametric.""" # LEGACY MODULE - TODO: remove or refactor diff --git a/skpro/tests/test_vendors.py b/skpro/tests/test_vendors.py index 6dfdf57fd..852c1d889 100644 --- a/skpro/tests/test_vendors.py +++ b/skpro/tests/test_vendors.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor import pytest @@ -9,7 +8,6 @@ @pytest.mark.skip(reason="avoiding pymc3 dependency for now") def test_construct_estimator(): - with pytest.raises(ValueError): BayesianVendorEstimator() diff --git a/skpro/tests/utils.py b/skpro/tests/utils.py index b8adb17aa..76bd795c6 100644 --- a/skpro/tests/utils.py +++ b/skpro/tests/utils.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Legacy module: test utils.""" # LEGACY MODULE - TODO: remove or refactor diff --git a/skpro/utils/__init__.py b/skpro/utils/__init__.py index ee389a940..45e7d572a 100644 --- a/skpro/utils/__init__.py +++ b/skpro/utils/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- """Utility functionality.""" diff --git a/skpro/utils/_maint/_show_versions.py b/skpro/utils/_maint/_show_versions.py index e3f873b91..542c5b9bd 100644 --- a/skpro/utils/_maint/_show_versions.py +++ b/skpro/utils/_maint/_show_versions.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -u -# -*- coding: utf-8 -*- # License: BSD 3 clause """Utility methods to print system info for debugging. @@ -110,8 +109,8 @@ def show_versions(): print("\nSystem:") # noqa: T001, T201 for k, stat in sys_info.items(): - print("{k:>10}: {stat}".format(k=k, stat=stat)) # noqa: T001, T201 + print(f"{k:>10}: {stat}") # noqa: T001, T201 print("\nPython dependencies:") # noqa: T001, T201 for k, stat in deps_info.items(): - print("{k:>13}: {stat}".format(k=k, stat=stat)) # noqa: T001, T201 + print(f"{k:>13}: {stat}") # noqa: T001, T201 diff --git a/skpro/utils/_maint/tests/__init__.py b/skpro/utils/_maint/tests/__init__.py index 53994c580..a8b8c99ef 100644 --- a/skpro/utils/_maint/tests/__init__.py +++ b/skpro/utils/_maint/tests/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- """Tests for the show_versions utility.""" diff --git a/skpro/utils/_maint/tests/test_show_versions.py b/skpro/utils/_maint/tests/test_show_versions.py index 5c9b3769c..2e7944d1c 100644 --- a/skpro/utils/_maint/tests/test_show_versions.py +++ b/skpro/utils/_maint/tests/test_show_versions.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Tests for the show_versions utility.""" from skpro.utils._maint._show_versions import ( @@ -19,7 +18,7 @@ def test_deps_info(): """Test that _get_deps_info returns package/version dict as per contract.""" deps_info = _get_deps_info() assert isinstance(deps_info, dict) - assert set(deps_info.keys()) == set(["skpro"]) + assert set(deps_info.keys()) == {"skpro"} deps_info_default = _get_deps_info(DEFAULT_DEPS_TO_SHOW) assert isinstance(deps_info_default, dict) diff --git a/skpro/utils/index.py b/skpro/utils/index.py index 5496dfd33..c9c4cde9f 100644 --- a/skpro/utils/index.py +++ b/skpro/utils/index.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Utility functions for working with indices.""" import numpy as np diff --git a/skpro/utils/plotting.py b/skpro/utils/plotting.py index 4d89ef22e..c249cba7e 100644 --- a/skpro/utils/plotting.py +++ b/skpro/utils/plotting.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Utility functions for plotting.""" import numpy as np import pandas as pd diff --git a/skpro/utils/tests/__init__.py b/skpro/utils/tests/__init__.py index 165478ec6..874535a04 100644 --- a/skpro/utils/tests/__init__.py +++ b/skpro/utils/tests/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- """Tests for utilities.""" diff --git a/skpro/utils/tests/test_plots.py b/skpro/utils/tests/test_plots.py index 68b41c2a5..af0a2a6ce 100644 --- a/skpro/utils/tests/test_plots.py +++ b/skpro/utils/tests/test_plots.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Test functionality of time series plotting functions.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) diff --git a/skpro/utils/utils.py b/skpro/utils/utils.py index be1bc07e4..99955ef34 100644 --- a/skpro/utils/utils.py +++ b/skpro/utils/utils.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor diff --git a/skpro/utils/validation/__init__.py b/skpro/utils/validation/__init__.py index e15f1e4e3..b636a1d7b 100644 --- a/skpro/utils/validation/__init__.py +++ b/skpro/utils/validation/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Validation utilities.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) diff --git a/skpro/utils/validation/_dependencies.py b/skpro/utils/validation/_dependencies.py index 7143f4925..c4283f761 100644 --- a/skpro/utils/validation/_dependencies.py +++ b/skpro/utils/validation/_dependencies.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Utility to check soft dependency imports, and raise warnings or errors.""" __author__ = ["fkiraly", "mloning"] @@ -89,7 +88,6 @@ def _check_soft_dependencies( raise TypeError("obj must be a class, an object, a str, or None") for package in packages: - try: req = Requirement(package) except InvalidRequirement: diff --git a/skpro/workflow/base.py b/skpro/workflow/base.py index 5e0b61005..ecbb3d9cc 100644 --- a/skpro/workflow/base.py +++ b/skpro/workflow/base.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor import abc diff --git a/skpro/workflow/cross_validation.py b/skpro/workflow/cross_validation.py index 4764a9c21..097338f37 100644 --- a/skpro/workflow/cross_validation.py +++ b/skpro/workflow/cross_validation.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor if False: diff --git a/skpro/workflow/manager/data.py b/skpro/workflow/manager/data.py index 16b36de7e..10f04b592 100644 --- a/skpro/workflow/manager/data.py +++ b/skpro/workflow/manager/data.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor import hashlib diff --git a/skpro/workflow/manager/models.py b/skpro/workflow/manager/models.py index 273cc9d37..73dc4d05d 100644 --- a/skpro/workflow/manager/models.py +++ b/skpro/workflow/manager/models.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor from skpro.workflow.base import Model diff --git a/skpro/workflow/table/table.py b/skpro/workflow/table/table.py index 9966a7f37..22e88f9f5 100644 --- a/skpro/workflow/table/table.py +++ b/skpro/workflow/table/table.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor import abc diff --git a/skpro/workflow/utils.py b/skpro/workflow/utils.py index a071dc34d..55a5f1bc2 100644 --- a/skpro/workflow/utils.py +++ b/skpro/workflow/utils.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # LEGACY MODULE - TODO: remove or refactor from .base import Controller, View