From 0ac8a71ee2ad670d7b3a39201f63e1b48938204c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 28 Mar 2023 19:58:02 -0700 Subject: [PATCH 01/39] [EXAMPLE DIFF] (Tree featuresv2) Fork of sklearn that maintains all necessary refactorings to enable downstream functionality (#32) #### Reference Issues/PRs This is the most up-to-date PR branch to consolidate all proposed refactor changes that work with: - unsupervised trees - oblique trees - no performance/runtime regressions against main #### What does this implement/fix? Explain your changes. Incorporates refactors to: Internal Cython of scikit-learn's: - criterion - splitter - tree Internals of Python in scikit-learns: - python Tree Adds the basic implementation of oblique trees. The implementation of oblique trees has been tested on all sklearn's `check_estimator` testing function and has error-checking bounds for the new hyperparameter introduced, which is `feature_combinations` that defaults to ``min(1.5, n_features)``. TODO: 1. [ ] ~Add honest support for trees (splitting the data at the Python API level)~ 2. [x] Build wheels 3. [ ] ~Brainstorm unit-tests, or weekly checks to determine when our fork is out-of-date compared to upstream sklearn~ 4. [x] Revamp README for the fork #### Any other comments? [cd build] --------- Signed-off-by: Adam Li Co-authored-by: Chester Huynh Co-authored-by: Parth Vora --- .circleci/config.yml | 33 +- .cirrus.star | 4 +- .github/workflows/check-changelog.yml | 3 +- .github/workflows/check-manifest.yml | 2 +- .github/workflows/labeler-module.yml | 4 +- .github/workflows/update_tracking_issue.yml | 2 +- .github/workflows/wheels.yml | 33 +- .gitignore | 1 + Makefile | 3 + README.rst | 322 ++++++---- build_tools/azure/install.sh | 2 +- build_tools/azure/install_win.sh | 2 +- doc/Makefile | 2 + doc/conf.py | 3 +- doc/modules/tree.rst | 61 +- examples/tree/plot_iris_dtc.py | 4 - setup.py | 45 +- sklearn/ensemble/_forest.py | 108 +++- sklearn/ensemble/tests/test_forest.py | 171 +++++ sklearn/tree/_classes.py | 162 +++-- sklearn/tree/_criterion.pxd | 45 +- sklearn/tree/_criterion.pyx | 285 ++++----- sklearn/tree/_splitter.pxd | 41 +- sklearn/tree/_splitter.pyx | 165 +++-- sklearn/tree/_tree.pxd | 90 ++- sklearn/tree/_tree.pyx | 659 ++++++++++++-------- sklearn/tree/tests/test_tree.py | 32 +- 27 files changed, 1499 insertions(+), 785 deletions(-) mode change 100755 => 100644 setup.py diff --git a/.circleci/config.yml b/.circleci/config.yml index e2f54c0665c78..e4e66b5c57f49 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -94,22 +94,23 @@ jobs: root: doc/_build/html paths: . - deploy: - docker: - - image: cimg/python:3.8.12 - steps: - - checkout - - run: ./build_tools/circle/checkout_merge_commit.sh - # Attach documentation generated in the 'doc' step so that it can be - # deployed. - - attach_workspace: - at: doc/_build/html - - run: ls -ltrh doc/_build/html/stable - - deploy: - command: | - if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then - bash build_tools/circle/push_doc.sh doc/_build/html/stable - fi + # XXX: in order to make sure our fork passes all the CIs and not remove too many LOC, we don't want to deploy + # deploy: + # docker: + # - image: cimg/python:3.8.12 + # steps: + # - checkout + # - run: ./build_tools/circle/checkout_merge_commit.sh + # # Attach documentation generated in the 'doc' step so that it can be + # # deployed. 
+ # - attach_workspace: + # at: doc/_build/html + # - run: ls -ltrh doc/_build/html/stable + # - deploy: + # command: | + # if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then + # bash build_tools/circle/push_doc.sh doc/_build/html/stable + # fi workflows: version: 2 diff --git a/.cirrus.star b/.cirrus.star index 8b3de0d10c532..2dd1e50144987 100644 --- a/.cirrus.star +++ b/.cirrus.star @@ -4,9 +4,9 @@ load("cirrus", "env", "fs", "http") def main(ctx): - # Only run for scikit-learn/scikit-learn. For debugging on a fork, you can + # Only run for neurodata/scikit-learn. For debugging on a fork, you can # comment out the following condition. - if env.get("CIRRUS_REPO_FULL_NAME") != "scikit-learn/scikit-learn": + if env.get("CIRRUS_REPO_FULL_NAME") != "neurodata/scikit-learn": return [] arm_wheel_yaml = "build_tools/cirrus/arm_wheel.yml" diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index d5bfc8ef0f430..53f64ba5c886b 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -10,12 +10,13 @@ jobs: check: name: A reviewer will let you know if it is required or can be bypassed runs-on: ubuntu-latest - if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} + if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 && github.repository == 'scikit-learn/scikit-learn' }} steps: - name: Get PR number and milestone run: | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV + echo "${{ github.repository }}" - uses: actions/checkout@v3 with: fetch-depth: '0' diff --git a/.github/workflows/check-manifest.yml b/.github/workflows/check-manifest.yml index 004cc452e385e..5ef9ce2213e90 100644 --- a/.github/workflows/check-manifest.yml +++ b/.github/workflows/check-manifest.yml @@ -7,7 +7,7 @@ on: jobs: check-manifest: # Don't run on forks - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/labeler-module.yml b/.github/workflows/labeler-module.yml index 061d0094b38c5..8092711f07e45 100644 --- a/.github/workflows/labeler-module.yml +++ b/.github/workflows/labeler-module.yml @@ -16,7 +16,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.0 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: "3" @@ -27,7 +27,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.0 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" configuration-path: ".github/labeler-file-extensions.yml" diff --git a/.github/workflows/update_tracking_issue.yml b/.github/workflows/update_tracking_issue.yml index 124ea1e8c6ac4..c176ce356a4cf 100644 --- a/.github/workflows/update_tracking_issue.yml +++ b/.github/workflows/update_tracking_issue.yml @@ -24,7 +24,7 @@ on: jobs: update_tracking_issue: runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' && github.event_name == 'schedule' + if: github.repository == 'neurodata/scikit-learn' && github.event_name == 'schedule' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 diff --git a/.github/workflows/wheels.yml 
b/.github/workflows/wheels.yml index b43f29ffa4f7f..4ab75fd361586 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -7,12 +7,12 @@ on: - cron: "42 3 */1 * *" push: branches: - - main + - fork # Release branches - "[0-9]+.[0-9]+.X" pull_request: branches: - - main + - fork - "[0-9]+.[0-9]+.X" # Manual run workflow_dispatch: @@ -26,7 +26,7 @@ jobs: check_build_trigger: name: Check build trigger runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' outputs: build: ${{ steps.check_build_trigger.outputs.build }} @@ -178,31 +178,8 @@ jobs: with: path: dist/*.tar.gz - # Upload the wheels and the source distribution - upload_anaconda: - name: Upload to Anaconda - runs-on: ubuntu-latest - needs: [build_wheels, build_sdist] - # The artifacts cannot be uploaded on PRs - if: github.event_name != 'pull_request' - - steps: - - name: Checkout scikit-learn - uses: actions/checkout@v3 - - - name: Download artifacts - uses: actions/download-artifact@v3 + - uses: actions/upload-artifact@v3 with: path: dist + name: ${{ matrix.python[0] }}-${{ matrix.os[1] }} - - name: Setup Python - uses: actions/setup-python@v4 - - - name: Upload artifacts - env: - # Secret variables need to be mapped to environment variables explicitly - SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN }} - SCIKIT_LEARN_STAGING_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_STAGING_UPLOAD_TOKEN }} - ARTIFACTS_PATH: dist/artifact - # Force a replacement if the remote file already exists - run: bash build_tools/github/upload_anaconda.sh diff --git a/.gitignore b/.gitignore index 89600846100a8..1e28896f50be6 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ build sklearn/datasets/__config__.py sklearn/**/*.html +scikit_learn_tree.egg-info/* dist/ MANIFEST diff --git a/Makefile b/Makefile index 5ea64dc0d6cac..148027b30f59f 100644 --- a/Makefile +++ b/Makefile @@ -63,3 +63,6 @@ doc-noplot: inplace code-analysis: flake8 sklearn | grep -v __init__ | grep -v external pylint -E -i y sklearn/ -d E1103,E0611,E1101 + +build-dev: + pip install --verbose --no-build-isolation --editable . diff --git a/README.rst b/README.rst index 5e2de6a6d8b46..fbdfdaa95ef4c 100644 --- a/README.rst +++ b/README.rst @@ -44,20 +44,36 @@ .. |PytestMinVersion| replace:: 5.3.1 .. |PlotlyMinVersion| replace:: 5.10.0 -.. image:: https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/doc/logos/scikit-learn-logo.png - :target: https://scikit-learn.org/ +``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line +with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is +released under the name ``scikit-learn-tree`` to avoid confusion. -**scikit-learn** is a Python module for machine learning built on top of -SciPy and is distributed under the 3-Clause BSD license. +It is currently maintained by a team of volunteers. -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. +The upstream package **scikit-learn** is a Python module for machine learning built on top of +SciPy and is distributed under the 3-Clause BSD license. Refer to their website for all documentation +needs: https://scikit-learn.org. -It is currently maintained by a team of volunteers. +Why a fork? 
+-----------
+Currently, the scikit-learn tree submodule is difficult to extend. Requests to modularize
+and improve the extensibility of the code are currently unsupported, or may take a long time.
+Advanced tree models that also leverage the robustness of scikit-learn are nonetheless desirable.
+
+However, "hard-forking" via copy/pasting the explicit Python/Cython code into another tree package
+altogether is undesirable because it results in a tree codebase that is inherently different
+and not compatible with ``scikit-learn``. For example, `quantile-forests `_,
+and `EconML `_ do this, and their current tree submodules
+cannot take advantage of improvements made in upstream ``scikit-learn``.
+
+An example of seamless integration would be `scikit-survival `_, which
+only needs to implement a subclass of the Cython ``Criterion`` object in their code to enable survival trees.
-Website: https://scikit-learn.org
+Maintaining a "soft-fork" of ``scikit-learn`` in the form of a repository fork allows us to develop
+a separate package that serves as a stand-in for ``sklearn`` in any package, extends the tree submodule
+and can also be synced with upstream changes in ``scikit-learn``. This enables the fork to always
+take advantage of improvements made in ``scikit-learn`` main upstream, while providing a customizable
+tree API.

 Installation
 ------------
@@ -73,133 +89,195 @@ scikit-learn requires:
 - joblib (>= |JoblibMinVersion|)
 - threadpoolctl (>= |ThreadpoolctlMinVersion|)
-=======
+============================
+Installing scikit-learn-tree
+============================
-**Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4.**
-scikit-learn 1.0 and later require Python 3.7 or newer.
-scikit-learn 1.1 and later require Python 3.8 or newer.
+Scikit-learn-tree is a maintained fork of scikit-learn, which extends the
+tree submodule in a few ways documented in :ref:`changelog of the fork
+`.
-Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and
-classes end with "Display") require Matplotlib (>= |MatplotlibMinVersion|).
-For running the examples Matplotlib >= |MatplotlibMinVersion| is required.
-A few examples require scikit-image >= |Scikit-ImageMinVersion|, a few examples
-require pandas >= |PandasMinVersion|, some examples require seaborn >=
-|SeabornMinVersion| and plotly >= |PlotlyMinVersion|.
+We release versions of scikit-learn-tree in an analogous fashion to
+scikit-learn main. Due to limited maintenance resources, we only release on PyPI
+and therefore recommend installing with ``pip``.
-User installation
-~~~~~~~~~~~~~~~~~
+There are different ways to install scikit-learn-tree:
-If you already have a working installation of numpy and scipy,
-the easiest way to install scikit-learn is using ``pip``::
+  * :ref:`Install the latest official release `. This
+    is the best approach for most users. It will provide a stable version
+    and pre-built packages are available for most platforms.
+
+  * :ref:`Building the package from source
+    `. This is best for users who want the
+    latest-and-greatest features and aren't afraid of running
+    brand-new code. This is also needed for users who wish to contribute to the
+    project.
-    pip install -U scikit-learn
+.. _install_fork_release:
-or ``conda``::
+Installing the latest release
+-----------------------------
+We release wheels for common distributions, so the latest release is installable via ``pip``.
-    conda install -c conda-forge scikit-learn
+..
prompt:: bash $ + + pip install scikit-learn-tree -The documentation includes more detailed `installation instructions `_. +This will install ``scikit-learn-tree`` under the namespace of ``sklearn``, which then +can be used as a stand-in for any package that relies on the public API of ``sklearn``. +For example, any usage of ``scikit-learn`` is preserved with ``scikit-learn-tree`` -Changelog ---------- + >>> # the sklearn installed is that of scikit-learn-tree and is equivalent to scikit-learn + >>> from sklearn.ensemble import RandomForestClassifier + >>> clf = RandomForestClassifier(random_state=0) + >>> X = [[ 1, 2, 3], # 2 samples, 3 features + ... [11, 12, 13]] + >>> y = [0, 1] # classes of each sample + >>> clf.fit(X, y) + RandomForestClassifier(random_state=0) -See the `changelog `__ -for a history of notable changes to scikit-learn. +.. _install_source: + +Building from source +-------------------- +If you are a developer and are interested in helping maintain, or add some new +features to the fork, the building from source instructions are exactly the same +as that of scikit-learn main, so please refer to `scikit-learn documentation `_ +for instructions on building from source. Development ------------ +=========== -We welcome new contributors of all experience levels. The scikit-learn -community goals are to be helpful, welcoming, and effective. The +We welcome new contributors of all experience levels, specifically to maintain the fork. +Any contributions that make sure our fork is "better in-line" with scikit-learn upstream, +or improves the tree submodule in anyway will be appreciated. + +The scikit-learn community goals are to be helpful, welcoming, and effective. The `Development Guide `_ has detailed information about contributing code, documentation, tests, and more. We've included some basic information in this README. -Important links -~~~~~~~~~~~~~~~ - -- Official source code repo: https://github.com/scikit-learn/scikit-learn -- Download releases: https://pypi.org/project/scikit-learn/ -- Issue tracker: https://github.com/scikit-learn/scikit-learn/issues - -Source code -~~~~~~~~~~~ - -You can check the latest sources with the command:: - - git clone https://github.com/scikit-learn/scikit-learn.git - -Contributing -~~~~~~~~~~~~ - -To learn more about making a contribution to scikit-learn, please see our -`Contributing guide -`_. - -Testing -~~~~~~~ - -After installation, you can launch the test suite from outside the source -directory (you will need to have ``pytest`` >= |PyTestMinVersion| installed):: - - pytest sklearn - -See the web page https://scikit-learn.org/dev/developers/contributing.html#testing-and-improving-test-coverage -for more information. - - Random number generation can be controlled during testing by setting - the ``SKLEARN_SEED`` environment variable. - -Submitting a Pull Request -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Before opening a Pull Request, have a look at the -full Contributing page to make sure your code complies -with our guidelines: https://scikit-learn.org/stable/developers/index.html - -Project History ---------------- - -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. - -The project is currently maintained by a team of volunteers. - -**Note**: `scikit-learn` was previously referred to as `scikits.learn`. 
-
-Help and Support
-----------------
-
-Documentation
-~~~~~~~~~~~~~
-
-- HTML documentation (stable release): https://scikit-learn.org
-- HTML documentation (development version): https://scikit-learn.org/dev/
-- FAQ: https://scikit-learn.org/stable/faq.html
-
-Communication
-~~~~~~~~~~~~~
-
-- Mailing list: https://mail.python.org/mailman/listinfo/scikit-learn
-- Gitter: https://gitter.im/scikit-learn/scikit-learn
-- Logos & Branding: https://github.com/scikit-learn/scikit-learn/tree/main/doc/logos
-- Blog: https://blog.scikit-learn.org
-- Calendar: https://blog.scikit-learn.org/calendar/
-- Twitter: https://twitter.com/scikit_learn
-- Twitter (commits): https://twitter.com/sklearn_commits
-- Stack Overflow: https://stackoverflow.com/questions/tagged/scikit-learn
-- Github Discussions: https://github.com/scikit-learn/scikit-learn/discussions
-- Website: https://scikit-learn.org
-- LinkedIn: https://www.linkedin.com/company/scikit-learn
-- YouTube: https://www.youtube.com/channel/UCJosFjYm0ZYVUARxuOZqnnw/playlists
-- Facebook: https://www.facebook.com/scikitlearnofficial/
-- Instagram: https://www.instagram.com/scikitlearnofficial/
-- TikTok: https://www.tiktok.com/@scikit.learn
-
-Citation
-~~~~~~~~
-
-If you use scikit-learn in a scientific publication, we would appreciate citations: https://scikit-learn.org/stable/about.html#citing-scikit-learn
+.. _fork-changelog:
+Major Changes of the Fork
+=========================
+
+The purpose of this page is to illustrate some of the main features that
+``scikit-learn-tree`` provides compared to ``scikit-learn``. It assumes
+an understanding of the core package ``scikit-learn`` and also of decision tree
+models. Please refer to our :ref:`installation instructions
+` for installing ``scikit-learn-tree``.
+
+Scikit-learn-tree otherwise operates as a stand-in for upstream ``scikit-learn``.
+It is used in packages exactly the same way and will support all features
+in the corresponding version of ``scikit-learn``. For example, if you
+are interested in features of ``scikit-learn`` v1.2.2 for the ``NearestNeighbors`` algorithm,
+then as long as ``scikit-learn-tree`` has a v1.2.2 release, it will have
+all those features.
+
+The breaking API changes will be with respect to anything in the ``tree`` submodule,
+and the related Forest ensemble models. See below for a detailed list of breaking changes.
+
+See: https://scikit-learn.org/ for documentation on scikit-learn main.
+
+Our Philosophy
+--------------
+Our design philosophy with this fork of ``scikit-learn`` is to maintain as few changes
+as possible, such that incorporating upstream changes into the fork requires minimal effort.
+
+Candidate changes and PRs accepted into the fork are those that:
+
+- improve compatibility with upstream ``scikit-learn`` main
+- enable improved extensibility of tree models
+
+Decision tree generalizations
+-----------------------------
+
+``Scikit-learn`` provides an axis-aligned :class:`~sklearn.tree.DecisionTreeClassifier`
+decision tree model (classifier and regressor), which has a few fundamental limitations
+that prevent 3rd parties from utilizing the existing class without forking a large
+amount of copy/pasted Python and Cython code. We highlight those limitations here
+and then describe how we generalize each of them.
+
+Cython Internal Private API:
+
+Note, the Cython API for scikit-learn is still not a publicly supported API, so it may
+change without warning.
+
+- leaf and split nodes: These nodes are treated the same way and there is no internal
+  API for setting them differently. Quantile trees and causal trees inherently generalize
+  how leaf nodes are set.
+- Criterion class: The criterion class currently assumes a supervised learning interface.
+  - Our fix: We implement a ``BaseCriterion`` object that provides an abstract API for unsupervised criteria.
+- Splitter class: The splitter class currently assumes a supervised learning interface and
+  does not provide a way of generalizing the way split candidates are proposed.
+  - Our fix: We implement a ``BaseSplitter`` object that provides an abstract API for unsupervised splitters and also implement an API to allow generalizations of the ``SplitRecord`` struct and the ``Splitter.node_split`` function. For example, this enables oblique splits to be considered.
+- Tree class: The tree class currently assumes a supervised learning interface and does not
+  provide a way of generalizing the type of tree.
+  - Our fix: We implement a ``BaseTree`` object that provides an abstract API for general tree models and also implement an API that allows generalization of the type of tree. For example, oblique trees are trivially implementable as an extension now.
+- stopping conditions for splitter: Currently, the ``Splitter.node_split`` function has various
+  stopping conditions for the splitter based on hyperparameters. It is plausible that these conditions
+  may be extended. For example, in causal trees, one may want the splitter to also account for
+  a minimal degree of heterogeneity (i.e. variance) in its children nodes.
+
+Python API:
+
+- ``sklearn.tree.BaseDecisionTree`` assumes the underlying tree model is supervised: The ``y``
+  parameter is required to be passed in, which is not necessary for general tree-based models.
+  For example, an unsupervised tree may pass in ``y=None``.
+  - Our fix: We fix this API, so the ``BaseDecisionTree`` is subclassable by unsupervised tree models that do not require ``y`` to be defined.
+- ``sklearn.tree.BaseDecisionTree`` does not provide a way to generalize the ``Criterion``, ``Splitter``
+  and ``Tree`` Cython classes used: The current codebase requires users to define custom
+  criteria and/or splitters outside the instantiation of the ``BaseDecisionTree``. This prevents
+  users from generalizing the ``Criterion`` and ``Splitter`` and creating a neat Python API wrapper.
+  Moreover, the ``Tree`` class is not customizable.
+  - Our fix: We internally implement a private function to actually build the entire tree, ``BaseDecisionTree._build_tree``, which can be overridden in subclasses that customize the criterion, splitter, or tree, or any combination of them.
+- ``sklearn.ensemble.BaseForest`` and its subclass algorithms are slow when ``n_samples`` is very high. Binning
+  features into a histogram, which is the basis of "LightGBM" and "HistGradientBoostingClassifier", is a computational
+  trick that can both significantly increase runtime efficiency and help prevent overfitting in trees, since
+  the sorting in "BestSplitter" is done on bins rather than the continuous feature values. This would enable
+  random forests and their variants to scale to millions of samples.
+  - Our fix: We added a ``max_bins=None`` keyword argument to the ``BaseForest`` class, and all its subclasses. The default behavior is no binning. The current implementation is not necessarily efficient. There are several improvements to be made. See below.
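+
+As a quick illustration of the new ``max_bins`` keyword argument described in the last item
+above, here is a minimal sketch of fitting a binned random forest. It mirrors the new tests
+added in ``sklearn/ensemble/tests/test_forest.py``; the dataset and the parameter values are
+arbitrary choices for illustration, and it assumes this fork is installed so that it is
+importable as ``sklearn``::
+
+    from sklearn.datasets import make_classification
+    from sklearn.ensemble import RandomForestClassifier
+
+    # 200 samples, 10 features; any array-like X, y works the same way
+    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
+
+    # max_bins=255 bins each feature before split search, similar in spirit to
+    # HistGradientBoostingClassifier; max_bins=None (the default) disables binning
+    clf = RandomForestClassifier(n_estimators=10, max_bins=255, random_state=0)
+    clf.fit(X, y)
+
+    # predict/apply re-bin incoming data with the fitted bin mapper
+    print(clf.predict(X[:5]))
+    print(clf.apply(X).shape)  # (200, 10): one leaf index per sample per tree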
+ +Overall, the existing tree models, such as :class:`~sklearn.tree.DecisionTreeClassifier` +and :class:`~sklearn.ensemble.RandomForestClassifier` all work exactly the same as they +would in ``scikit-learn`` main, but these extensions enable 3rd-party packages to extend +the Cython/Python API easily. + +Roadmap +------- +There are several improvements that can be made in this fork. Primarily, the binning feature +promises to make Random Forests and their variants ultra-fast. However, the binning needs +to be implemented in a similar fashion to ``HistGradientBoostingClassifier``, which passes +in the binning thresholds throughout the tree construction step, such that the split nodes +store the actual numerical value of the bin rather than the "bin index". This requires +modifying the tree Cython code to take in a ``binning_thresholds`` parameter that is part +of the ``_BinMapper`` fitted class. This also allows us not to do any binning during prediction/apply +time because the tree already stores the "numerical" threshold value we would want to apply +to any incoming ``X`` that is not binned. + +Besides that modification, the tree and splitter need to be able to handle not just ``np.float32`` +data (the type for X normally in Random Forests), but also ``uint8`` data (the type for X when it +is binned in to e.g. 255 bins). This would not only save RAM since ``uint8`` storage of millions +of samples would result in many GB saved, but also improved runtime. + +So in summary, the Cython code of the tree submodule needs to take in an extra parameter for +the binning thresholds if binning occurs and also be able to handle ``X`` being of dtype ``uint8``. +Afterwards, Random Forests will have fully leveraged the binning feature. + +Something to keep in mind is that upstream scikit-learn is actively working on incorporating +missing-value handling and categorical handling into Random Forests. + +Next steps +---------- + +We have briefly covered how the tree submodule has changed with respect to ``scikit-learn``. +This enables packages to leverage these changes in developing more complex tree models +that may, or may not eventually be PRed into ``scikit-learn``. For example, + +- `scikit-tree `_ is a scikit-learn + compatible package for more complex and advanced tree models. + +If you are developing tree models, we encourage you to take a look at that package, or +if you have suggestions to make the tree submodule of our fork, ``scikit-learn-tree`` +more \ No newline at end of file diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 5238cd1121d2e..db5b5d9414053 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -7,7 +7,7 @@ set -x source build_tools/shared.sh UNAMESTR=`uname` -CCACHE_LINKS_DIR="/tmp/ccache" +CCACHE_LINKS_DIR="/tmp/ccachev2" setup_ccache() { CCACHE_BIN=`which ccache || echo ""` diff --git a/build_tools/azure/install_win.sh b/build_tools/azure/install_win.sh index ab559a1878971..011e962885d45 100755 --- a/build_tools/azure/install_win.sh +++ b/build_tools/azure/install_win.sh @@ -22,4 +22,4 @@ show_installed_libraries python setup.py bdist_wheel # Install the generated wheel package to test it -pip install --pre --no-index --find-links dist scikit-learn +pip install --pre --no-index --find-links dist scikit-learn-tree diff --git a/doc/Makefile b/doc/Makefile index b56a1289cd581..c728bbbfd033e 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -53,6 +53,8 @@ html: @echo @echo "Build finished. 
The HTML pages are in $(BUILDDIR)/html/stable" +# rm $(BUILDDIR)/html/stable/index.html +# mv $(BUILDDIR)/html/stable/fork_index.html $(BUILDDIR)/html/stable/index.html html-noplot: $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable @echo diff --git a/doc/conf.py b/doc/conf.py index 52b084b331c8c..01e0a332dd54f 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -103,7 +103,8 @@ # source_encoding = 'utf-8' # The main toctree document. -root_doc = "contents" +# root_doc = "contents" +root_doc = "index" # General information about the project. project = "scikit-learn" diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index 789b0bab616ca..7fa12fd16d487 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -141,7 +141,7 @@ Once trained, you can plot the tree with the :func:`plot_tree` function:: >>> tree.plot_tree(clf) [...] -.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_002.png +.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_003.png :target: ../auto_examples/tree/plot_iris_dtc.html :scale: 75 :align: center @@ -331,6 +331,8 @@ total cost over the entire trees (by summing the cost at each node) of :math:`O(n_{features}n_{samples}^{2}\log(n_{samples}))`. +.. _tree_tips_usage: + Tips on practical use ===================== @@ -612,11 +614,66 @@ be pruned. This process stops when the pruned tree's minimal * :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` +Classification, regression and multi-output problems +---------------------------------------------------- + +OTs can be used for both classification and regression, and can handle multi-output +problems in the same manner as DTs. + +Complexity +---------- + +The run time cost to construct an OT will be similar to that of a DT, with the +added complexity of a (possibly sparse) matrix multiplication to combine random +data columns into candidate split values. The cost at each node is +:math:`O(n_{features}n_{samples}\log(n_{samples}) + n_{features}n_{samples}max\_features \lambda)` +where the additional :math:`n_{features}n_{samples}max\_features \lambda` term +comes from the (possibly sparse) matrix multiplication controlled by both the +number of candidate splits to generate ("max_features") and the sparsity of +the projection matrix that combines the data features (":math:`\lambda`"). + +Another consideration is space-complexity. + +Space-complexity and storing the OT pickled on disc is also a consideration. OTs +at every node need to store an additional vector of feature indices and vector of +feature weights that are used together to form the candidate splits. + +Tips on practical use +--------------------- + +Similar to DTs, the intuition for most parameters are the same. Therefore refer +to :ref:`tips for using decision trees ` for information on standard +tree parameters. Specific parameters, such as ``max_features`` and +``feature_combinations`` are different or special to OTs. + + * As specified earlier, ``max_features`` is not constrained to ``n_features`` + as it is in DTs. Setting ``max_features`` higher requires more computation time because + the algorithm needs to sample more candidate splits at every node. However, it also possibly + lets the user to sample more informative splits, thereby improving the model fit. This + presents a tradeoff between runtime resources and improvements to the model. 
In practice, + we found that sampling more splits, say up to ``max_features=n_features**2``, is desirable + if one is willing to spend the computational resources. + + * ``feature_combinations`` is the :math:`\lambda` term presented in the complexity + analysis, which specifies how sparse our combination of features is. If + ``feature_combinations=n_features``, then OT is the ``Forest-RC`` version. However, + in practice, ``feature_combinations`` can be set much lower, therefore improving runtime + and storage complexity. + +Finally, when asking the question of when to use OTs vs DTs, scikit-learn recommends +always trying both model using some type of cross-validation procedure and hyperparameter +optimization (e.g. `GridSearchCV`). If one has prior knowledge about how the data is +distributed along its features, such as data being axis-aligned, then one might use a DT. +Other considerations are runtime and space complexity. + .. topic:: References: .. [BRE] L. Breiman, J. Friedman, R. Olshen, and C. Stone. Classification and Regression Trees. Wadsworth, Belmont, CA, 1984. - + + .. [RF] L. Breiman. Random Forests. Machine Learning 45, 5–32 (2001). + https://doi.org/10.1023/A:1010933404324. + * https://en.wikipedia.org/wiki/Decision_tree_learning * https://en.wikipedia.org/wiki/Predictive_analytics diff --git a/examples/tree/plot_iris_dtc.py b/examples/tree/plot_iris_dtc.py index 14f6506b5810f..0dcca718bc6f0 100644 --- a/examples/tree/plot_iris_dtc.py +++ b/examples/tree/plot_iris_dtc.py @@ -2,16 +2,12 @@ ======================================================================= Plot the decision surface of decision trees trained on the iris dataset ======================================================================= - Plot the decision surface of a decision tree trained on pairs of features of the iris dataset. - See :ref:`decision tree ` for more information on the estimator. - For each pair of iris features, the decision tree learns decision boundaries made of combinations of simple thresholding rules inferred from the training samples. - We also show the tree structure of a model built on all of the features. """ # %% diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index f5522600f623f..e39e39455b7bc --- a/setup.py +++ b/setup.py @@ -30,19 +30,19 @@ builtins.__SKLEARN_SETUP__ = True -DISTNAME = "scikit-learn" -DESCRIPTION = "A set of python modules for machine learning and data mining" +DISTNAME = "scikit-learn-tree" +DESCRIPTION = "A maintained fork of scikit-learn that extends the tree submodule." 
with open("README.rst") as f: LONG_DESCRIPTION = f.read() -MAINTAINER = "Andreas Mueller" -MAINTAINER_EMAIL = "amueller@ais.uni-bonn.de" +MAINTAINER = "Adam Li" +MAINTAINER_EMAIL = "adam.li@columbia.edu" URL = "http://scikit-learn.org" -DOWNLOAD_URL = "https://pypi.org/project/scikit-learn/#files" +DOWNLOAD_URL = "https://pypi.org/project/scikit-learn-tree/#files" LICENSE = "new BSD" PROJECT_URLS = { - "Bug Tracker": "https://github.com/scikit-learn/scikit-learn/issues", + "Bug Tracker": "https://github.com/neurodata/scikit-learn/issues", "Documentation": "https://scikit-learn.org/stable/documentation.html", - "Source Code": "https://github.com/scikit-learn/scikit-learn", + "Source Code": "https://github.com/neurodata/scikit-learn", } # We can actually import a restricted version of sklearn that @@ -170,11 +170,11 @@ def check_package_status(package, min_version): package_status["up_to_date"] = False package_status["version"] = "" - req_str = "scikit-learn requires {} >= {}.\n".format(package, min_version) + req_str = "scikit-learn-tree requires {} >= {}.\n".format(package, min_version) instructions = ( "Installation instructions are available on the " - "scikit-learn website: " + "scikit-learn-tree website: " "http://scikit-learn.org/stable/install.html\n" ) @@ -221,10 +221,10 @@ def check_package_status(package, min_version): {"sources": ["_cdnmf_fast.pyx"], "include_np": True}, ], "ensemble": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, ], "ensemble._hist_gradient_boosting": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, {"sources": ["histogram.pyx"], "include_np": True}, {"sources": ["splitting.pyx"], "include_np": True}, {"sources": ["_binning.pyx"], "include_np": True}, @@ -306,7 +306,7 @@ def check_package_status(package, min_version): {"sources": ["_ball_tree.pyx"], "include_np": True}, {"sources": ["_kd_tree.pyx"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, - {"sources": ["_quad_tree.pyx"], "include_np": True}, + {"sources": ["_quad_tree.pyx"], "language": "c++", "include_np": True}, ], "svm": [ { @@ -374,9 +374,24 @@ def check_package_status(package, min_version): "include_np": True, "optimization_level": "O3", }, - {"sources": ["_splitter.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_criterion.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_utils.pyx"], "include_np": True, "optimization_level": "O3"}, + { + "sources": ["_splitter.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_criterion.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_utils.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, ], "utils": [ {"sources": ["sparsefuncs_fast.pyx"], "include_np": True}, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 19203da4fce1f..a3c29e4a269ce 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -40,6 +40,7 @@ class calls the ``fit`` method of each sub-estimator on random samples # License: BSD 3 clause +from time import time from numbers import Integral, Real from warnings import catch_warnings, simplefilter, warn import threading @@ -72,10 +73,11 @@ class calls the ``fit`` method 
of each sub-estimator on random samples _check_sample_weight, _check_feature_names_in, ) +from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import _num_samples from ..utils._param_validation import Interval, StrOptions from ..utils._param_validation import RealNotInt - +from ._hist_gradient_boosting.binning import _BinMapper __all__ = [ "RandomForestClassifier", @@ -210,6 +212,10 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], + "max_bins": [ + None, + Interval(Integral, 1, None, closed="left"), + ], } @abstractmethod @@ -228,6 +234,7 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator=estimator, @@ -244,6 +251,7 @@ def __init__( self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples + self.max_bins = max_bins def apply(self, X): """ @@ -263,6 +271,15 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -420,6 +437,38 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -628,6 +677,35 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + def _bin_data(self, X, is_training_data): + """Bin data X. 
+ + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. + """ + + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + def _accumulate_prediction(predict, X, out, lock): """ @@ -669,6 +747,7 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator=estimator, @@ -683,6 +762,7 @@ def __init__( class_weight=class_weight, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, ) @staticmethod @@ -856,6 +936,14 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -937,6 +1025,7 @@ def __init__( warm_start=False, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator, @@ -950,6 +1039,7 @@ def __init__( warm_start=warm_start, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, ) def predict(self, X): @@ -975,6 +1065,14 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1399,6 +1497,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=DecisionTreeClassifier(), @@ -1423,6 +1522,7 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -1734,6 +1834,7 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=DecisionTreeRegressor(), @@ -1757,6 +1858,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -2084,6 +2186,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=ExtraTreeClassifier(), @@ -2108,6 +2211,7 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -2406,6 +2510,7 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2429,6 +2534,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 9bf0bb2becd9b..0150340f24bc6 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -118,6 +118,120 @@ FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) +def _sparse_parity(n, p=20, p_star=3, random_state=None): + """Generate sparse parity dataset. + + Sparse parity is a multivariate generalization of the + XOR problem. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset, by default 20 + p_star : int, optional + The number of informative dimensions, by default 3. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Sparse parity dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + y[i] = sum(X[i, :p_star] > 0) % 2 + + return X, y + + +def _orthant(n, p=6, random_state=None): + """Generate orthant dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 6. + rec : int, optional + _description_, by default 1 + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Orthant dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + orth_labels = np.asarray([2**i for i in range(0, p)][::-1]) + + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + idx = np.where(X[i, :] > 0)[0] + y[i] = sum(orth_labels[idx]) + + if len(np.unique(y)) < 2**p: + raise RuntimeError("Increase sample size to get a label in each orthant.") + + return X, y + + +def _trunk(n, p=10, random_state=None): + """Generate trunk dataset. 
+ + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 10. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Trunk dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + + References + ---------- + [1] Gerard V. Trunk. A problem of dimensionality: A + simple example. IEEE Transactions on Pattern Analysis + and Machine Intelligence, 1(3):306–307, 1979. + """ + rng = np.random.RandomState(seed=random_state) + + mu_1 = np.array([1 / i for i in range(1, p + 1)]) + mu_0 = -1 * mu_1 + cov = np.identity(p) + + X = np.vstack( + ( + rng.multivariate_normal(mu_0, cov, int(n / 2)), + rng.multivariate_normal(mu_1, cov, int(n / 2)), + ) + ) + y = np.concatenate((np.zeros(int(n / 2)), np.ones(int(n / 2)))) + return X, y + + def check_classification_toy(name): """Check classification on a toy dataset.""" ForestClassifier = FOREST_CLASSIFIERS[name] @@ -1791,3 +1905,60 @@ def test_round_samples_to_one_when_samples_too_low(class_weight): n_estimators=10, max_samples=1e-4, class_weight=class_weight, random_state=0 ) forest.fit(X, y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classification_toy_withbins(name): + """Check classification on a toy dataset.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, random_state=1, max_bins=255) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + clf = ForestClassifier( + n_estimators=10, max_features=1, random_state=1, max_bins=255 + ) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + # also test apply + leaf_indices = clf.apply(X) + assert leaf_indices.shape == (len(X), clf.n_estimators) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) +def test_regression_criterion_withbins(name, criterion): + # Check consistency on regression dataset. 
+ ForestRegressor = FOREST_REGRESSORS[name] + + reg = ForestRegressor( + n_estimators=5, criterion=criterion, random_state=1, max_bins=250 + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert ( + score > 0.93 + ), "Failed with max_features=None, criterion %s and score = %f" % ( + criterion, + score, + ) + + reg = ForestRegressor( + n_estimators=5, + criterion=criterion, + max_features=6, + random_state=1, + max_bins=250, + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.92, "Failed with max_features=6, criterion %s and score = %f" % ( + criterion, + score, + ) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index b175275ea92dc..bd54483bf2dfe 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -40,8 +40,8 @@ from ..utils._param_validation import Hidden, Interval, StrOptions from ..utils._param_validation import RealNotInt -from ._criterion import Criterion -from ._splitter import Splitter +from ._criterion import BaseCriterion +from ._splitter import BaseSplitter from ._tree import DepthFirstTreeBuilder from ._tree import BestFirstTreeBuilder from ._tree import Tree @@ -174,7 +174,7 @@ def get_n_leaves(self): check_is_fitted(self) return self.tree_.n_leaves - def fit(self, X, y, sample_weight=None, check_input=True): + def fit(self, X, y=None, sample_weight=None, check_input=True): self._validate_params() random_state = check_random_state(self.random_state) @@ -184,9 +184,12 @@ def fit(self, X, y, sample_weight=None, check_input=True): # csr. check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) - X, y = self._validate_data( - X, y, validate_separately=(check_X_params, check_y_params) - ) + if y is not None or self._get_tags()["requires_y"]: + X, y = self._validate_data( + X, y, validate_separately=(check_X_params, check_y_params) + ) + else: + X = self._validate_data(X, **check_X_params) if issparse(X): X.sort_indices() @@ -195,7 +198,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): "No support for np.int64 index based sparse matrices" ) - if self.criterion == "poisson": + if y is not None and self.criterion == "poisson": if np.any(y < 0): raise ValueError( "Some value(s) of y are negative which is" @@ -209,45 +212,56 @@ def fit(self, X, y, sample_weight=None, check_input=True): # Determine output settings n_samples, self.n_features_in_ = X.shape - is_classification = is_classifier(self) - y = np.atleast_1d(y) - expanded_class_weight = None + # Do preprocessing if 'y' is passed + is_classification = False + if y is not None: + is_classification = is_classifier(self) + + y = np.atleast_1d(y) + expanded_class_weight = None - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. 
+ y = np.reshape(y, (-1, 1)) - self.n_outputs_ = y.shape[1] + self.n_outputs_ = y.shape[1] - if is_classification: - check_classification_targets(y) - y = np.copy(y) + if is_classification: + check_classification_targets(y) + y = np.copy(y) - self.classes_ = [] - self.n_classes_ = [] + self.classes_ = [] + self.n_classes_ = [] - if self.class_weight is not None: - y_original = np.copy(y) + if self.class_weight is not None: + y_original = np.copy(y) - y_encoded = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - y = y_encoded - - if self.class_weight is not None: - expanded_class_weight = compute_sample_weight( - self.class_weight, y_original - ) + y_encoded = np.zeros(y.shape, dtype=int) + for k in range(self.n_outputs_): + classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_encoded + + if self.class_weight is not None: + expanded_class_weight = compute_sample_weight( + self.class_weight, y_original + ) - self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) + self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + if len(y) != n_samples: + raise ValueError( + "Number of labels=%d does not match number of samples=%d" + % (len(y), n_samples) + ) + + # set decision-tree model parameters max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth if isinstance(self.min_samples_leaf, numbers.Integral): @@ -299,16 +313,10 @@ def fit(self, X, y, sample_weight=None, check_input=True): max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - if len(y) != n_samples: - raise ValueError( - "Number of labels=%d does not match number of samples=%d" - % (len(y), n_samples) - ) - if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) - if expanded_class_weight is not None: + if y is not None and expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: @@ -320,10 +328,63 @@ def fit(self, X, y, sample_weight=None, check_input=True): else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + # build the actual tree now with the parameters + self._build_tree( + X, + y, + sample_weight, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ) + + return self + + def _build_tree( + self, + X, + y, + sample_weight, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : Array-like + X dataset. + y : Array-like + Y targets. + sample_weight : Array-like + Sample weights + min_samples_leaf : float + Number of samples required to be a leaf. + min_weight_leaf : float + Weight of samples required to be a leaf. + max_leaf_nodes : float + Maximum number of leaf nodes allowed in tree. + min_samples_split : float + Minimum number of samples to split on. + max_depth : int + The maximum depth of any tree. + random_state : int + Random seed. 
+ """ + + n_samples = X.shape[0] + # Build tree criterion = self.criterion - if not isinstance(criterion, Criterion): - if is_classification: + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): criterion = CRITERIA_CLF[self.criterion]( self.n_outputs_, self.n_classes_ ) @@ -337,7 +398,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS splitter = self.splitter - if not isinstance(self.splitter, Splitter): + if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( criterion, self.max_features_, @@ -385,8 +446,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): self._prune_tree() - return self - def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: @@ -817,7 +876,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, - "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], + "criterion": [ + StrOptions({"gini", "entropy", "log_loss"}), + Hidden(BaseCriterion), + ], "class_weight": [dict, list, StrOptions({"balanced"}), None], } @@ -1173,7 +1235,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): **BaseDecisionTree._parameter_constraints, "criterion": [ StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}), - Hidden(Criterion), + Hidden(BaseCriterion), ], } diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 47f616c6bad50..2e179e78e8c3f 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -4,6 +4,8 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause @@ -15,13 +17,11 @@ from ._tree cimport SIZE_t # Type for indices and counters from ._tree cimport INT32_t # Signed 32 bit integer from ._tree cimport UINT32_t # Unsigned 32 bit integer -cdef class Criterion: - # The criterion computes the impurity of a node and the reduction of - # impurity of a split on that node. It also computes the output statistics - # such as the mean in regression and class probabilities in classification. + +cdef class BaseCriterion: + """Abstract interface for criterion.""" # Internal structures - cdef const DOUBLE_t[:, ::1] y # Values of y cdef const DOUBLE_t[:] sample_weight # Sample weights cdef const SIZE_t[:] sample_indices # Sample indices in X, y @@ -37,19 +37,7 @@ cdef class Criterion: cdef double weighted_n_left # Weighted number of samples in the left node cdef double weighted_n_right # Weighted number of samples in the right node - # The criterion object is maintained such that left and right collected - # statistics correspond to samples[start:pos] and samples[pos:end]. - - # Methods - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end - ) except -1 nogil + # Core methods that criterion class _must_ implement. 
cdef int reset(self) except -1 nogil cdef int reverse_reset(self) except -1 nogil cdef int update(self, SIZE_t new_pos) except -1 nogil @@ -71,6 +59,25 @@ cdef class Criterion: ) noexcept nogil cdef double proxy_impurity_improvement(self) noexcept nogil + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil + +cdef class Criterion(BaseCriterion): + """Abstract interface for supervised impurity criteria.""" + + cdef const DOUBLE_t[:, ::1] y + + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" @@ -88,4 +95,4 @@ cdef class RegressionCriterion(Criterion): cdef double[::1] sum_total # The sum of w*y. cdef double[::1] sum_left # Same as above, but for the left side of the split - cdef double[::1] sum_right # Same as above, but for the right side of the split + cdef double[::1] sum_right # Same as above, but for the right side of the split \ No newline at end of file diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 7cd7bbb0e3c1b..c94914daa0e0b 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -9,6 +9,8 @@ # Fares Hedayati # Jacob Schreiber # Nelson Liu +# Adam Li +# Jong Shin # # License: BSD 3 clause @@ -29,11 +31,20 @@ from ._utils cimport WeightedMedianCalculator # EPSILON is used in the Poisson criterion cdef double EPSILON = 10 * np.finfo('double').eps -cdef class Criterion: - """Interface for impurity criteria. - +cdef class BaseCriterion: + """This is an abstract interface for criterion. For example, a tree model could + be either supervisedly, or unsupervisedly computing impurity on samples of + covariates, or labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for criteria. + The downstream classes _must_ implement methods to compute the impurity + in current node and in children nodes. This object stores methods on how to calculate how good a split is using - different metrics. + a set API. + Samples in the "current" node are stored in `samples[start:end]` which is + partitioned around `pos` (an index in `start:end`) so that: + - the samples of left child node are stored in `samples[start:pos]` + - the samples of right child node are stored in `samples[pos:end]` """ def __getstate__(self): return {} @@ -41,61 +52,23 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, - ) except -1 nogil: - """Placeholder for a method which will initialize the criterion. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - - Parameters - ---------- - y : ndarray, dtype=DOUBLE_t - y is a buffer that can store values for n_outputs target variables - stored as a Cython memoryview. - sample_weight : ndarray, dtype=DOUBLE_t - The weight of each sample stored as a Cython memoryview. - weighted_n_samples : double - The total weight of the samples being considered - sample_indices : ndarray, dtype=SIZE_t - A mask on the samples. Indices of the samples in X and y we want to use, - where sample_indices[start:end] correspond to the samples in this node. 
- start : SIZE_t - The first sample to be used on this node - end : SIZE_t - The last sample used on this node - - """ - pass - cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - This method must be implemented by the subclass. """ pass cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - This method must be implemented by the subclass. """ pass cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. - This updates the collected statistics by moving sample_indices[pos:new_pos] from the right child to the left child. It must be implemented by the subclass. - Parameters ---------- new_pos : SIZE_t @@ -105,7 +78,6 @@ cdef class Criterion: cdef double node_impurity(self) noexcept nogil: """Placeholder for calculating the impurity of the node. - Placeholder for a method which will evaluate the impurity of the current node, i.e. the impurity of sample_indices[start:end]. This is the primary function of the criterion class. The smaller the impurity the @@ -116,11 +88,9 @@ cdef class Criterion: cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Placeholder for calculating the impurity of children. - Placeholder for a method which evaluates the impurity in children nodes, i.e. the impurity of sample_indices[start:pos] + the impurity of sample_indices[pos:end]. - Parameters ---------- impurity_left : double pointer @@ -134,10 +104,8 @@ cdef class Criterion: cdef void node_value(self, double* dest) noexcept nogil: """Placeholder for storing the node value. - Placeholder for a method which will compute the node value of sample_indices[start:end] and save the value into dest. - Parameters ---------- dest : double pointer @@ -147,12 +115,10 @@ cdef class Criterion: cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -167,28 +133,21 @@ cdef class Criterion: double impurity_left, double impurity_right) noexcept nogil: """Compute the improvement in impurity. - This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: - N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) - where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, - Parameters ---------- impurity_parent : double The initial impurity of the parent node before the split - impurity_left : double The impurity of the left child - impurity_right : double The impurity of the right child - Return ------ double : improvement in impurity after the split occurs @@ -199,6 +158,61 @@ cdef class Criterion: - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Abstract method which will set sample pointers in the criterion. 
+ The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + Parameters + ---------- + start : SIZE_t + The index of the first sample to be used on computation of criteria of the current node. + end : SIZE_t + The last sample used on this node + """ + pass + + +cdef class Criterion(BaseCriterion): + """Interface for impurity criteria. + The supervised criterion computes the impurity of a node and the reduction of + impurity of a split on that node using the distribution of labels in parent and + children nodes. It also computes the output statistics + such as the mean in regression and class probabilities in classification. + Instances of this class are responsible for compute splits' impurity difference + Criterion is the base class for criteria used in supervised tree-based models + with a homogeneous float64-dtyped y. + """ + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil: + """Placeholder for a method which will initialize the criterion. + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + Parameters + ---------- + y : ndarray, dtype=DOUBLE_t + y is a buffer that can store values for n_outputs target variables + stored as a Cython memoryview. + sample_weight : ndarray, dtype=DOUBLE_t + The weight of each sample stored as a Cython memoryview. + weighted_n_samples : double + The total weight of the samples being considered + sample_indices : ndarray, dtype=SIZE_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. + """ + pass + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" @@ -206,7 +220,6 @@ cdef class ClassificationCriterion(Criterion): def __cinit__(self, SIZE_t n_outputs, cnp.ndarray[SIZE_t, ndim=1] n_classes): """Initialize attributes for this criterion. - Parameters ---------- n_outputs : SIZE_t @@ -254,18 +267,11 @@ cdef class ClassificationCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end + const SIZE_t[:] sample_indices ) except -1 nogil: """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. - Parameters ---------- y : ndarray, dtype=DOUBLE_t @@ -277,18 +283,24 @@ cdef class ClassificationCriterion(Criterion): sample_indices : ndarray, dtype=SIZE_t A mask on the samples. Indices of the samples in X and y we want to use, where sample_indices[start:end] correspond to the samples in this node. 
- start : SIZE_t - The first sample to use in the mask - end : SIZE_t - The last sample to use in the mask """ self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + self.n_node_samples = end - start self.start = start self.end = end - self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0.0 cdef SIZE_t i @@ -301,12 +313,12 @@ cdef class ClassificationCriterion(Criterion): memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] # w is originally set to be 1.0, meaning that if no sample weights # are given, the default weight of each sample is 1.0. - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] # Count weighted class frequency for each target for k in range(self.n_outputs): @@ -317,11 +329,9 @@ cdef class ClassificationCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -338,7 +348,6 @@ cdef class ClassificationCriterion(Criterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -355,10 +364,8 @@ cdef class ClassificationCriterion(Criterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. - Parameters ---------- new_pos : SIZE_t @@ -428,7 +435,6 @@ cdef class ClassificationCriterion(Criterion): cdef void node_value(self, double* dest) noexcept nogil: """Compute the node value of sample_indices[start:end] and save it into dest. - Parameters ---------- dest : double pointer @@ -443,23 +449,17 @@ cdef class ClassificationCriterion(Criterion): cdef class Entropy(ClassificationCriterion): r"""Cross Entropy impurity criterion. - This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. - The cross-entropy is then defined as - cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the cross-entropy criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -481,10 +481,8 @@ cdef class Entropy(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). - Parameters ---------- impurity_left : double pointer @@ -516,24 +514,18 @@ cdef class Entropy(ClassificationCriterion): cdef class Gini(ClassificationCriterion): r"""Gini Index impurity criterion. 
- This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. - The Gini Index is then defined as: - index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the Gini criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -559,10 +551,8 @@ cdef class Gini(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]) using the Gini index. - Parameters ---------- impurity_left : double pointer @@ -601,24 +591,20 @@ cdef class Gini(ClassificationCriterion): cdef class RegressionCriterion(Criterion): r"""Abstract regression criterion. - This handles cases where the target is a continuous value, and is evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` by using :: - var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. - Parameters ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -648,23 +634,29 @@ cdef class RegressionCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + + self.sq_sum_total = 0.0 self.weighted_n_node_samples = 0. cdef SIZE_t i @@ -673,14 +665,14 @@ cdef class RegressionCriterion(Criterion): cdef DOUBLE_t y_ik cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 - self.sq_sum_total = 0.0 + memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] @@ -692,7 +684,6 @@ cdef class RegressionCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start.""" @@ -785,13 +776,11 @@ cdef class RegressionCriterion(Criterion): cdef class MSE(RegressionCriterion): """Mean squared error impurity criterion. 
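For the classification criteria just above, a small NumPy sketch of the node impurities as given by the docstring formulas; ``class_counts`` is a made-up example vector and the log base is left unspecified, as in the formula.

    import numpy as np

    class_counts = np.array([3.0, 1.0, 4.0])   # hypothetical counts for one node
    p = class_counts / class_counts.sum()      # count_k in the docstrings

    gini = 1.0 - np.sum(p ** 2)                        # 1 - sum_k count_k**2
    entropy = -np.sum(p[p > 0] * np.log(p[p > 0]))     # -sum_k count_k log(count_k)
    print(round(gini, 4), round(entropy, 4))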
- MSE = var_left + var_right """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the MSE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -807,22 +796,16 @@ cdef class MSE(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. - The MSE proxy is derived from - sum_{i left}(y_i - y_pred_L)^2 + sum_{i right}(y_i - y_pred_R)^2 = sum(y_i^2) - n_L * mean_{i left}(y_i)^2 - n_R * mean_{i right}(y_i)^2 - Neglecting constant terms, this gives: - - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2 """ cdef SIZE_t k @@ -839,7 +822,6 @@ cdef class MSE(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -883,7 +865,6 @@ cdef class MSE(RegressionCriterion): cdef class MAE(RegressionCriterion): r"""Mean absolute error impurity criterion. - MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true value and f_i is the predicted value.""" @@ -895,12 +876,10 @@ cdef class MAE(RegressionCriterion): def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. - Parameters ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -933,26 +912,30 @@ cdef class MAE(RegressionCriterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ - cdef SIZE_t i, p, k - cdef DOUBLE_t w = 1.0 - + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + cdef SIZE_t i, p, k + cdef DOUBLE_t w = 1.0 + self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0. 
cdef void** left_child = self.left_child_ptr @@ -963,10 +946,10 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).reset() for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): # push method ends up calling safe_realloc, hence `except -1` @@ -981,11 +964,9 @@ cdef class MAE(RegressionCriterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1016,7 +997,6 @@ cdef class MAE(RegressionCriterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1044,7 +1024,6 @@ cdef class MAE(RegressionCriterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1107,7 +1086,6 @@ cdef class MAE(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the MAE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -1132,7 +1110,6 @@ cdef class MAE(RegressionCriterion): cdef void children_impurity(self, double* p_impurity_left, double* p_impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -1179,21 +1156,17 @@ cdef class MAE(RegressionCriterion): cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman. - Uses the formula (35) in Friedman's original Gradient Boosting paper: - diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) """ cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -1234,9 +1207,7 @@ cdef class FriedmanMSE(MSE): cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. - Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) - Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. It remains the implemented impurity (factor 2 is skipped): @@ -1255,7 +1226,6 @@ cdef class Poisson(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the Poisson criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. 
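For the half Poisson deviance described above, a hedged NumPy sketch with ``y_pred`` taken as the node mean, so that the ``sum(y_pred - y_true)`` term vanishes and the factor 2 can be dropped as the docstring notes; ``y_true`` is a made-up positive sample.

    import numpy as np

    y_true = np.array([1.0, 2.0, 0.5, 3.0])
    y_pred = y_true.mean()

    half_poisson_dev = np.mean(y_true * np.log(y_true / y_pred))
    print(half_poisson_dev)  # >= 0; smaller means a purer node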
@@ -1265,24 +1235,18 @@ cdef class Poisson(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. - The Poisson proxy is derived from: - sum_{i left }(y_i * log(y_i / y_pred_L)) + sum_{i right}(y_i * log(y_i / y_pred_R)) = sum(y_i * log(y_i) - n_L * mean_{i left}(y_i) * log(mean_{i left}(y_i)) - n_R * mean_{i right}(y_i) * log(mean_{i right}(y_i)) - Neglecting constant terms, this gives - - sum{i left }(y_i) * log(mean{i left}(y_i)) - sum{i right}(y_i) * log(mean{i right}(y_i)) """ @@ -1312,7 +1276,6 @@ cdef class Poisson(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity of the right child (sample_indices[pos:end]) for Poisson. """ diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 13fec5974c3c5..b0207ab0a715d 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -4,12 +4,14 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause # See _splitter.pyx for details. -from ._criterion cimport Criterion +from ._criterion cimport BaseCriterion, Criterion from ._tree cimport DTYPE_t # Type of X from ._tree cimport DOUBLE_t # Type of y, sample_weight @@ -28,14 +30,15 @@ cdef struct SplitRecord: double impurity_left # Impurity of the left split. double impurity_right # Impurity of the right split. -cdef class Splitter: +cdef class BaseSplitter: + """Abstract interface for splitter.""" + # The splitter searches in the input space for a feature and a threshold # to split the samples samples[start:end]. # # The impurity computations are delegated to a criterion object. # Internal structures - cdef public Criterion criterion # Impurity criterion cdef public SIZE_t max_features # Number of features to test cdef public SIZE_t min_samples_leaf # Min samples in a leaf cdef public double min_weight_leaf # Minimum weight in a leaf @@ -54,7 +57,6 @@ cdef class Splitter: cdef SIZE_t start # Start position for the current node cdef SIZE_t end # End position for the current node - cdef const DOUBLE_t[:, ::1] y cdef const DOUBLE_t[:] sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -74,27 +76,38 @@ cdef class Splitter: # This allows optimization with depth-based tree building. 
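Before the splitter methods below, a conceptual pure-Python rendering (not the Cython API itself) of how the refactored criterion is now driven: dataset-wide state is set once via ``init``, and the per-node window is then moved with ``set_sample_pointers``. The method names mirror the declarations above, but the class itself is hypothetical.

    class ToyCriterion:  # hypothetical stand-in for the Cython Criterion
        def init(self, y, sample_weight, weighted_n_samples, sample_indices):
            # called once per fit with dataset-wide quantities
            self.y, self.sample_weight = y, sample_weight
            self.weighted_n_samples = weighted_n_samples
            self.sample_indices = sample_indices

        def set_sample_pointers(self, start, end):
            # called for every node: restrict to sample_indices[start:end]
            self.start, self.end = start, end
            self.n_node_samples = end - start
            # ...recompute the per-node sufficient statistics here...

    # The splitter's init(...) calls ToyCriterion.init(...) once; each
    # node_reset(start, end) then only calls set_sample_pointers(start, end).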
# Methods - cdef int init( - self, - object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight - ) except -1 - cdef int node_reset( self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples ) except -1 nogil - cdef int node_split( self, double impurity, # Impurity of the node SplitRecord* split, SIZE_t* n_constant_features ) except -1 nogil - cdef void node_value(self, double* dest) noexcept nogil - cdef double node_impurity(self) noexcept nogil + cdef int pointer_size(self) noexcept nogil + +cdef class Splitter(BaseSplitter): + cdef public Criterion criterion # Impurity criterion + cdef const DOUBLE_t[:, ::1] y + + cdef int init( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight + ) except -1 + + # Methods that allow modifications to stopping conditions + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + ) noexcept nogil + cdef bint check_postsplit_conditions( + self + ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 83a80d90cc1b9..17a747433d1a8 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -8,7 +8,10 @@ # Joel Nothman # Fares Hedayati # Jacob Schreiber +# Adam Li +# Jong Shin # + # License: BSD 3 clause from ._criterion cimport Criterion @@ -43,16 +46,78 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil self.threshold = 0. self.improvement = -INFINITY -cdef class Splitter: - """Abstract splitter class. +cdef class BaseSplitter: + """This is an abstract interface for splitters. + + For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of + covariates, labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for splitting. + + A splitter is usually used in conjunction with a criterion class, which explicitly handles + computing the criteria, which we split on. The setting of that criterion class is handled + by downstream classes. - Splitters are called by tree builders to find the best splits on both - sparse and dense data, one split at a time. + The downstream classes _must_ implement methods to compute the split in a node. """ + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int node_reset(self, SIZE_t start, SIZE_t end, + double* weighted_n_node_samples) except -1 nogil: + """Reset splitter on node samples[start:end]. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to consider + end : SIZE_t + The index of the last sample to consider + weighted_n_node_samples : ndarray, dtype=double pointer + The total weight of those samples + """ + pass + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) except -1 nogil: + """Find the best split on node samples[start:end]. + + This is a placeholder method. The majority of computation will be done + here. + + It should return -1 upon errors. + """ + pass + + cdef void node_value(self, double* dest) noexcept nogil: + """Copy the value of node samples[start:end] into dest.""" + pass + + cdef double node_impurity(self) noexcept nogil: + """Return the impurity of the current node.""" + pass + + cdef int pointer_size(self) noexcept nogil: + """Size of the pointer for split records. 
+ + Overriding this function allows one to use different subclasses of + `SplitRecord`. + """ + return sizeof(SplitRecord) + +cdef class Splitter(BaseSplitter): + """Abstract interface for supervised splitters.""" + def __cinit__(self, Criterion criterion, SIZE_t max_features, SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state): + object random_state, *argv): """ Parameters ---------- @@ -75,7 +140,6 @@ cdef class Splitter: random_state : object The user inputted random state to be used for pseudo-randomness """ - self.criterion = criterion self.n_samples = 0 @@ -86,11 +150,6 @@ cdef class Splitter: self.min_weight_leaf = min_weight_leaf self.random_state = random_state - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass def __reduce__(self): return (type(self), (self.criterion, @@ -127,7 +186,6 @@ cdef class Splitter: are assumed to have uniform weight. This is represented as a Cython memoryview. """ - self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef SIZE_t n_samples = X.shape[0] @@ -165,6 +223,19 @@ cdef class Splitter: self.y = y self.sample_weight = sample_weight + + self.criterion.init( + self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples + ) + + self.criterion.set_sample_pointers( + self.start, + self.end + ) + return 0 cdef int node_reset(self, SIZE_t start, SIZE_t end, @@ -187,30 +258,11 @@ cdef class Splitter: self.start = start self.end = end - self.criterion.init( - self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples, - start, - end - ) + self.criterion.set_sample_pointers(start, end) weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: - """Find the best split on node samples[start:end]. - - This is a placeholder method. The majority of computation will be done - here. - - It should return -1 upon errors. - """ - - pass - cdef void node_value(self, double* dest) noexcept nogil: """Copy the value of node samples[start:end] into dest.""" @@ -221,6 +273,41 @@ cdef class Splitter: return self.criterion.node_impurity() + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + ) noexcept nogil: + """Check stopping conditions pre-split. + + This is typically a metric that is cheaply computed given the + current proposed split, which is stored as a the `current_split` + argument. + """ + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + + if (((current_split.pos - self.start) < min_samples_leaf) or + ((self.end - current_split.pos) < min_samples_leaf)): + return 1 + + return 0 + + cdef bint check_postsplit_conditions( + self + ) noexcept nogil: + """Check stopping conditions after evaluating the split. + + This takes some metric that is stored in the Criterion + object and checks against internal stop metrics. + """ + cdef double min_weight_leaf = self.min_weight_leaf + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + return 1 + + return 0 + # Introduce a fused-class to make it possible to share the split implementation # between the dense and sparse cases in the node_split_best and node_split_random # functions. 
The alternative would have been to use inheritance-based polymorphism @@ -229,7 +316,7 @@ cdef class Splitter: ctypedef fused Partitioner: DensePartitioner SparsePartitioner - + cdef inline int node_split_best( Splitter splitter, Partitioner partitioner, @@ -349,15 +436,13 @@ cdef inline int node_split_best( current_split.pos = p # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(current_split) == 1: continue criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() @@ -645,8 +730,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(current_split) == 1: continue # Evaluate split @@ -656,8 +740,7 @@ cdef inline int node_split_random( criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 1966651d8c89a..8140733a9fc26 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -13,6 +13,8 @@ import numpy as np cimport numpy as cnp +from libcpp.vector cimport vector + ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef cnp.npy_intp SIZE_t # Type for indices and counters @@ -33,40 +35,32 @@ cdef struct Node: SIZE_t n_node_samples # Number of samples at the node DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node - -cdef class Tree: - # The Tree object is a binary tree structure constructed by the - # TreeBuilder. The tree structure is used for predictions and - # feature importances. - - # Input/Output layout - cdef public SIZE_t n_features # Number of features in X - cdef SIZE_t* n_classes # Number of classes in y[:, k] - cdef public SIZE_t n_outputs # Number of outputs in y - cdef public SIZE_t max_n_classes # max(n_classes) - +cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. cdef public SIZE_t max_depth # Max depth of the tree cdef public SIZE_t node_count # Counter for node IDs cdef public SIZE_t capacity # Capacity of tree, in terms of nodes cdef Node* nodes # Array of nodes - cdef double* value # (capacity, n_outputs, max_n_classes) array of values - cdef SIZE_t value_stride # = n_outputs * max_n_classes - # Methods - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples) except -1 nogil + cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample + cdef double* value # Array of values prediction values for each node + + # Generic Methods: These are generic methods used by any tree. 
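Returning to the splitter changes just above: the inlined ``min_samples_leaf``/``min_weight_leaf`` checks in ``node_split_best`` and ``node_split_random`` are now routed through ``check_presplit_conditions`` and ``check_postsplit_conditions``. A conceptual pure-Python sketch of why that matters; the signatures below are simplified and illustrative only.

    class ToySplitter:
        def __init__(self, start, end, min_samples_leaf):
            self.start, self.end = start, end
            self.min_samples_leaf = min_samples_leaf

        def check_presplit_conditions(self, pos):
            # default rule: reject if either child would be too small
            return ((pos - self.start) < self.min_samples_leaf
                    or (self.end - pos) < self.min_samples_leaf)

    class PickySplitter(ToySplitter):
        def check_presplit_conditions(self, pos):
            # a downstream package can layer an extra stopping rule on top
            return super().check_presplit_conditions(pos) or (self.end - self.start) < 10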
cdef int _resize(self, SIZE_t capacity) except -1 nogil cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil - - cdef cnp.ndarray _get_value_ndarray(self) - cdef cnp.ndarray _get_node_ndarray(self) - - cpdef cnp.ndarray predict(self, object X) - + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples + ) except -1 nogil + + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) cdef cnp.ndarray _apply_sparse_csr(self, object X) @@ -78,6 +72,49 @@ cdef class Tree: cpdef compute_node_depths(self) cpdef compute_feature_importances(self, normalize=*) + # Abstract methods: these functions must be implemented by any decision tree + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node, + ) noexcept nogil + +cdef class Tree(BaseTree): + # The Supervised Tree object is a binary tree structure constructed by the + # TreeBuilder. The tree structure is used for predictions and + # feature importances. + # + # Value of upstream properties: + # - value_stride = n_outputs * max_n_classes + # - value = (capacity, n_outputs, max_n_classes) array of values + + # Input/Output layout for supervised tree + cdef public SIZE_t n_features # Number of features in X + cdef SIZE_t* n_classes # Number of classes in y[:, k] + cdef public SIZE_t n_outputs # Number of outputs in y + cdef public SIZE_t max_n_classes # max(n_classes) + + # Methods + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) + + cpdef cnp.ndarray predict(self, object X) # ============================================================================= # Tree builder @@ -91,8 +128,7 @@ cdef class TreeBuilder: # This class controls the various stopping criteria and the node splitting # evaluation order, e.g. depth-first or best-first. 
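For the abstract ``_compute_feature`` hook declared above, a conceptual pure-Python sketch: the default axis-aligned tree reads a single column of ``X``, whereas a projection-based subclass could combine several columns before comparing against the node threshold. Function names and signatures here are illustrative, not the Cython API.

    import numpy as np

    def axis_aligned_feature(X, sample_index, feature):
        # default behaviour: the value in one column of X
        return X[sample_index, feature]

    def projected_feature(X, sample_index, weights):
        # hypothetical projection-based variant: a weighted combination of columns
        return float(np.dot(X[sample_index], weights))

    X = np.array([[1.0, 2.0], [3.0, 4.0]])
    print(axis_aligned_feature(X, 0, 1))                   # 2.0
    print(projected_feature(X, 0, np.array([0.5, 0.5])))   # 1.5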
- cdef Splitter splitter # Splitting algorithm - + cdef Splitter splitter cdef SIZE_t min_samples_split # Minimum number of samples in an internal node cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf cdef double min_weight_leaf # Minimum weight in a leaf diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 75eed058bfd4e..e5b759aee23df 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -22,6 +22,8 @@ from libcpp.vector cimport vector from libcpp.algorithm cimport pop_heap from libcpp.algorithm cimport push_heap from libcpp cimport bool +from cython.operator cimport dereference as deref +from libc.stdlib cimport malloc, free import struct @@ -83,6 +85,7 @@ NODE_DTYPE = np.asarray((&dummy)).dtype # TreeBuilder # ============================================================================= + cdef class TreeBuilder: """Interface for different tree building strategies.""" @@ -196,9 +199,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef bint is_left cdef SIZE_t n_node_samples = splitter.n_samples cdef double weighted_n_node_samples - cdef SplitRecord split cdef SIZE_t node_id + cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef double impurity = INFINITY cdef SIZE_t n_constant_features cdef bint is_leaf @@ -248,7 +253,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_leaf = is_leaf or impurity <= EPSILON if not is_leaf: - splitter.node_split(impurity, &split, &n_constant_features) + splitter.node_split(impurity, split_ptr, &n_constant_features) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 @@ -256,8 +266,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (split.improvement + EPSILON < min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, impurity, n_node_samples, + node_id = tree._add_node(parent, is_left, is_leaf, split_ptr, + impurity, n_node_samples, weighted_n_node_samples) if node_id == INTPTR_MAX: @@ -297,6 +307,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen + + # free the memory created for the SplitRecord pointer + free(split_ptr) + if rc == -1: raise MemoryError() @@ -462,6 +476,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): FrontierRecord* res) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. 
""" cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef SIZE_t node_id cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 @@ -483,7 +499,11 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) if not is_leaf: - splitter.node_split(impurity, &split, &n_constant_features) + splitter.node_split(impurity, split_ptr, &n_constant_features) + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 is_leaf = (is_leaf or split.pos >= end or @@ -493,7 +513,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, impurity, n_node_samples, + split_ptr, impurity, n_node_samples, weighted_n_node_samples) if node_id == INTPTR_MAX: return -1 @@ -522,7 +542,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.improvement = 0.0 res.impurity_left = impurity res.impurity_right = impurity - + + free(split_ptr) return 0 @@ -530,190 +551,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # Tree # ============================================================================= -cdef class Tree: - """Array-based representation of a binary decision tree. - - The binary tree is represented as a number of parallel arrays. The i-th - element of each array holds information about the node `i`. Node 0 is the - tree's root. You can find a detailed description of all arrays in - `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split - nodes, resp. In this case the values of nodes of the other type are - arbitrary! - - Attributes - ---------- - node_count : int - The number of nodes (internal nodes + leaves) in the tree. - - capacity : int - The current capacity (i.e., size) of the arrays, which is at least as - great as `node_count`. - - max_depth : int - The depth of the tree, i.e. the maximum depth of its leaves. - - children_left : array of int, shape [node_count] - children_left[i] holds the node id of the left child of node i. - For leaves, children_left[i] == TREE_LEAF. Otherwise, - children_left[i] > i. This child handles the case where - X[:, feature[i]] <= threshold[i]. - - children_right : array of int, shape [node_count] - children_right[i] holds the node id of the right child of node i. - For leaves, children_right[i] == TREE_LEAF. Otherwise, - children_right[i] > i. This child handles the case where - X[:, feature[i]] > threshold[i]. - - feature : array of int, shape [node_count] - feature[i] holds the feature to split on, for the internal node i. - - threshold : array of double, shape [node_count] - threshold[i] holds the threshold for the internal node i. - - value : array of double, shape [node_count, n_outputs, max_n_classes] - Contains the constant prediction value of each node. - - impurity : array of double, shape [node_count] - impurity[i] holds the impurity (i.e., the value of the splitting - criterion) at node i. - - n_node_samples : array of int, shape [node_count] - n_node_samples[i] holds the number of training samples reaching node i. - - weighted_n_node_samples : array of double, shape [node_count] - weighted_n_node_samples[i] holds the weighted number of training samples - reaching node i. +cdef class BaseTree: + """Base class for Cython tree models. + + Downstream classes must implement """ - # Wrap for outside world. 
- # WARNING: these reference the current `nodes` and `value` buffers, which - # must not be freed by a subsequent memory allocation. - # (i.e. through `_resize` or `__setstate__`) - property n_classes: - def __get__(self): - return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) - - property children_left: - def __get__(self): - return self._get_node_ndarray()['left_child'][:self.node_count] - - property children_right: - def __get__(self): - return self._get_node_ndarray()['right_child'][:self.node_count] - - property n_leaves: - def __get__(self): - return np.sum(np.logical_and( - self.children_left == -1, - self.children_right == -1)) - - property feature: - def __get__(self): - return self._get_node_ndarray()['feature'][:self.node_count] - - property threshold: - def __get__(self): - return self._get_node_ndarray()['threshold'][:self.node_count] - - property impurity: - def __get__(self): - return self._get_node_ndarray()['impurity'][:self.node_count] - - property n_node_samples: - def __get__(self): - return self._get_node_ndarray()['n_node_samples'][:self.node_count] - - property weighted_n_node_samples: - def __get__(self): - return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] - - property value: - def __get__(self): - return self._get_value_ndarray()[:self.node_count] - - # TODO: Convert n_classes to cython.integral memory view once - # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): - """Constructor.""" - cdef SIZE_t dummy = 0 - size_t_dtype = np.array(dummy).dtype - - n_classes = _check_n_classes(n_classes, size_t_dtype) - - # Input/Output layout - self.n_features = n_features - self.n_outputs = n_outputs - self.n_classes = NULL - safe_realloc(&self.n_classes, n_outputs) - - self.max_n_classes = np.max(n_classes) - self.value_stride = n_outputs * self.max_n_classes - - cdef SIZE_t k - for k in range(n_outputs): - self.n_classes[k] = n_classes[k] - - # Inner structures - self.max_depth = 0 - self.node_count = 0 - self.capacity = 0 - self.value = NULL - self.nodes = NULL - - def __dealloc__(self): - """Destructor.""" - # Free all inner structures - free(self.n_classes) - free(self.value) - free(self.nodes) - - def __reduce__(self): - """Reduce re-implementation, for pickling.""" - return (Tree, (self.n_features, - sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), - self.n_outputs), self.__getstate__()) - - def __getstate__(self): - """Getstate re-implementation, for pickling.""" - d = {} - # capacity is inferred during the __setstate__ using nodes - d["max_depth"] = self.max_depth - d["node_count"] = self.node_count - d["nodes"] = self._get_node_ndarray() - d["values"] = self._get_value_ndarray() - return d - - def __setstate__(self, d): - """Setstate re-implementation, for unpickling.""" - self.max_depth = d["max_depth"] - self.node_count = d["node_count"] - - if 'nodes' not in d: - raise ValueError('You have loaded Tree version which ' - 'cannot be imported') - - node_ndarray = d['nodes'] - value_ndarray = d['values'] - - value_shape = (node_ndarray.shape[0], self.n_outputs, - self.max_n_classes) - - node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) - value_ndarray = _check_value_ndarray( - value_ndarray, - expected_dtype=np.dtype(np.float64), - expected_shape=value_shape - ) - - self.capacity = node_ndarray.shape[0] - if self._resize_c(self.capacity) != 0: - raise MemoryError("resizing tree to %d" % self.capacity) - - nodes = 
memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), - self.capacity * sizeof(Node)) - value = memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) - - cdef int _resize(self, SIZE_t capacity) except -1 nogil: + cdef int _resize( + self, + SIZE_t capacity + ) except -1 nogil: """Resize all inner arrays to `capacity`, if `capacity` == -1, then double the size of the inner arrays. @@ -725,7 +571,10 @@ cdef class Tree: with gil: raise MemoryError() - cdef int _resize_c(self, SIZE_t capacity=INTPTR_MAX) except -1 nogil: + cdef int _resize_c( + self, + SIZE_t capacity=INTPTR_MAX + ) except -1 nogil: """Guts of _resize Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -756,14 +605,87 @@ cdef class Tree: self.capacity = capacity return 0 - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples) except -1 nogil: - """Add a node to the tree. + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set split node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the split node data. + node : Node* + The pointer to the node that will hold the split node. + """ + # left_child and right_child will be set later for a split node + node.feature = split_node.feature + node.threshold = split_node.threshold + return 1 + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set leaf node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the leaf node data. + node : Node* + The pointer to the node that will hold the leaf node. + """ + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + return 1 + + cdef DTYPE_t _compute_feature(self, const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node) noexcept nogil: + """Compute feature from a given data matrix, X. + + In axis-aligned trees, this is simply the value in the column of X + for this specific feature. + """ + # the feature index + cdef DTYPE_t feature = X_ndarray[sample_index, node.feature] + return feature + + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples + ) except -1 nogil: + """Add a node to the tree. The new node registers itself as the child of its parent. - + Parameters + ---------- + parent : SIZE_t + The index of the parent. If '_TREE_UNDEFINED', then the current + node is a root node. + is_left : bint + Whether or not the current node is to the left of the parent node. + is_leaf : bint + Whether or not the current node is a leaf node. + split_node : SplitRecord* + A pointer to a SplitRecord pointer address. + impurity : double + The impurity of the node to be added. + n_node_samples : SIZE_t + The number of samples in the node. + weighted_n_node_samples : double + The weight of the samples in the node. + Returns (size_t)(-1) on error. 
""" cdef SIZE_t node_id = self.node_count @@ -784,28 +706,18 @@ cdef class Tree: self.nodes[parent].right_child = node_id if is_leaf: - node.left_child = _TREE_LEAF - node.right_child = _TREE_LEAF - node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED - + if self._set_leaf_node(split_node, node) != 1: + with gil: + raise RuntimeError else: - # left_child and right_child will be set later - node.feature = feature - node.threshold = threshold + if self._set_split_node(split_node, node) != 1: + with gil: + raise RuntimeError self.node_count += 1 return node_id - cpdef cnp.ndarray predict(self, object X): - """Predict target for X.""" - out = self._get_value_ndarray().take(self.apply(X), axis=0, - mode='clip') - if self.n_outputs == 1: - out = out.reshape(X.shape[0], self.max_n_classes) - return out - cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): @@ -835,13 +747,20 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature value + cdef DTYPE_t feature_value = 0 + with nogil: for i in range(n_samples): node = self.nodes + # While node not a leaf while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - if X_ndarray[i, node.feature] <= node.threshold: + + # compute the feature value to compare against threshold + feature_value = self._compute_feature(X_ndarray, i, node) + if feature_value <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -902,7 +821,6 @@ cdef class Tree: # ... and node.right_child != _TREE_LEAF: if feature_to_sample[node.feature] == i: feature_value = X_sample[node.feature] - else: feature_value = 0. @@ -951,6 +869,9 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature index + cdef DOUBLE_t feature + with nogil: for i in range(n_samples): node = self.nodes @@ -962,7 +883,9 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 - if X_ndarray[i, node.feature] <= node.threshold: + # compute the feature value to compare against threshold + feature = self._compute_feature(X_ndarray, i, node) + if feature <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1091,8 +1014,6 @@ cdef class Tree: cpdef compute_feature_importances(self, normalize=True): """Computes the importance of each feature (aka variable).""" - cdef Node* left - cdef Node* right cdef Node* nodes = self.nodes cdef Node* node = nodes cdef Node* end_node = node + self.node_count @@ -1105,13 +1026,9 @@ cdef class Tree: while node != end_node: if node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - left = &nodes[node.left_child] - right = &nodes[node.right_child] - - importances[node.feature] += ( - node.weighted_n_node_samples * node.impurity - - left.weighted_n_node_samples * left.impurity - - right.weighted_n_node_samples * right.impurity) + self._compute_feature_importances( + importances, node) + node += 1 for i in range(self.n_features): @@ -1127,44 +1044,27 @@ cdef class Tree: return np.asarray(importances) - cdef cnp.ndarray _get_value_ndarray(self): - """Wraps value as a 3-d NumPy array. - - The array keeps a reference to this Tree, which manages the underlying - memory. + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node + ) noexcept nogil: + """Compute feature importances from a Node in the Tree. 
+ + Wrapped in a private function to allow subclassing that + computes feature importances. """ - cdef cnp.npy_intp shape[3] - shape[0] = self.node_count - shape[1] = self.n_outputs - shape[2] = self.max_n_classes - cdef cnp.ndarray arr - arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + cdef Node* nodes = self.nodes + cdef Node* left + cdef Node* right - cdef cnp.ndarray _get_node_ndarray(self): - """Wraps nodes as a NumPy struct array. + left = &nodes[node.left_child] + right = &nodes[node.right_child] - The array keeps a reference to this Tree, which manages the underlying - memory. Individual fields are publicly accessible as properties of the - Tree. - """ - cdef cnp.npy_intp shape[1] - shape[0] = self.node_count - cdef cnp.npy_intp strides[1] - strides[0] = sizeof(Node) - cdef cnp.ndarray arr - Py_INCREF(NODE_DTYPE) - arr = PyArray_NewFromDescr( cnp.ndarray, - NODE_DTYPE, 1, shape, - strides, self.nodes, - cnp.NPY_ARRAY_DEFAULT, None) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + importances[node.feature] += ( + node.weighted_n_node_samples * node.impurity - + left.weighted_n_node_samples * left.impurity - + right.weighted_n_node_samples * right.impurity) def compute_partial_dependence(self, DTYPE_t[:, ::1] X, int[::1] target_features, @@ -1273,6 +1173,237 @@ cdef class Tree: total_weight) +cdef class Tree(BaseTree): + """Array-based representation of a binary decision tree. + + The binary tree is represented as a number of parallel arrays. The i-th + element of each array holds information about the node `i`. Node 0 is the + tree's root. You can find a detailed description of all arrays in + `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split + nodes, resp. In this case the values of nodes of the other type are + arbitrary! + + Attributes + ---------- + node_count : int + The number of nodes (internal nodes + leaves) in the tree. + + capacity : int + The current capacity (i.e., size) of the arrays, which is at least as + great as `node_count`. + + max_depth : int + The depth of the tree, i.e. the maximum depth of its leaves. + + children_left : array of int, shape [node_count] + children_left[i] holds the node id of the left child of node i. + For leaves, children_left[i] == TREE_LEAF. Otherwise, + children_left[i] > i. This child handles the case where + X[:, feature[i]] <= threshold[i]. + + children_right : array of int, shape [node_count] + children_right[i] holds the node id of the right child of node i. + For leaves, children_right[i] == TREE_LEAF. Otherwise, + children_right[i] > i. This child handles the case where + X[:, feature[i]] > threshold[i]. + + feature : array of int, shape [node_count] + feature[i] holds the feature to split on, for the internal node i. + + threshold : array of double, shape [node_count] + threshold[i] holds the threshold for the internal node i. + + value : array of double, shape [node_count, n_outputs, max_n_classes] + Contains the constant prediction value of each node. + + impurity : array of double, shape [node_count] + impurity[i] holds the impurity (i.e., the value of the splitting + criterion) at node i. + + n_node_samples : array of int, shape [node_count] + n_node_samples[i] holds the number of training samples reaching node i. 
+ + weighted_n_node_samples : array of double, shape [node_count] + weighted_n_node_samples[i] holds the weighted number of training samples + reaching node i. + """ + # Wrap for outside world. + # WARNING: these reference the current `nodes` and `value` buffers, which + # must not be freed by a subsequent memory allocation. + # (i.e. through `_resize` or `__setstate__`) + property n_classes: + def __get__(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + property children_left: + def __get__(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + property children_right: + def __get__(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + property n_leaves: + def __get__(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + property feature: + def __get__(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + property threshold: + def __get__(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + property impurity: + def __get__(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + property n_node_samples: + def __get__(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + property weighted_n_node_samples: + def __get__(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + property value: + def __get__(self): + return self._get_value_ndarray()[:self.node_count] + + # TODO: Convert n_classes to cython.integral memory view once + # https://github.com/cython/cython/issues/5243 is fixed + def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): + """Constructor.""" + cdef SIZE_t dummy = 0 + size_t_dtype = np.array(dummy).dtype + + n_classes = _check_n_classes(n_classes, size_t_dtype) + + # Input/Output layout + self.n_features = n_features + self.n_outputs = n_outputs + self.n_classes = NULL + safe_realloc(&self.n_classes, n_outputs) + + self.max_n_classes = np.max(n_classes) + self.value_stride = n_outputs * self.max_n_classes + + cdef SIZE_t k + for k in range(n_outputs): + self.n_classes[k] = n_classes[k] + + # Inner structures + self.max_depth = 0 + self.node_count = 0 + self.capacity = 0 + self.value = NULL + self.nodes = NULL + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.n_classes) + free(self.value) + free(self.nodes) + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (Tree, (self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["node_count"] = self.node_count + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.node_count = d["node_count"] + + if 'nodes' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + node_ndarray = d['nodes'] + value_ndarray = d['values'] + + value_shape = (node_ndarray.shape[0], self.n_outputs, + self.max_n_classes) + + node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) + value_ndarray = _check_value_ndarray( + value_ndarray, + expected_dtype=np.dtype(np.float64), + 
expected_shape=value_shape + ) + + self.capacity = node_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + nodes = memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + value = memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(double)) + + cdef cnp.ndarray _get_value_ndarray(self): + """Wraps value as a 3-d NumPy array. + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef cnp.npy_intp shape[3] + shape[0] = self.node_count + shape[1] = self.n_outputs + shape[2] = self.max_n_classes + cdef cnp.ndarray arr + arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cdef cnp.ndarray _get_node_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. + """ + cdef cnp.npy_intp shape[1] + shape[0] = self.node_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Node) + cdef cnp.ndarray arr + Py_INCREF(NODE_DTYPE) + arr = PyArray_NewFromDescr( cnp.ndarray, + NODE_DTYPE, 1, shape, + strides, self.nodes, + cnp.NPY_ARRAY_DEFAULT, None) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cpdef cnp.ndarray predict(self, object X): + """Predict target for X.""" + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') + if self.n_outputs == 1: + out = out.reshape(X.shape[0], self.max_n_classes) + return out + + def _check_n_classes(n_classes, expected_dtype): if n_classes.ndim != 1: raise ValueError( @@ -1755,6 +1886,8 @@ cdef _build_pruned_tree( stack[BuildPrunedRecord] prune_stack BuildPrunedRecord stack_record + SplitRecord split + with nogil: # push root node onto stack prune_stack.push({"start": 0, "depth": 0, "parent": _TREE_UNDEFINED, "is_left": 0}) @@ -1771,8 +1904,12 @@ cdef _build_pruned_tree( is_leaf = leaves_in_subtree[orig_node_id] node = &orig_tree.nodes[orig_node_id] + # redefine to a SplitRecord to pass into _add_node + split.feature = node.feature + split.threshold = node.threshold + new_node_id = tree._add_node( - parent, is_left, is_leaf, node.feature, node.threshold, + parent, is_left, is_leaf, &split, node.impurity, node.n_node_samples, node.weighted_n_node_samples) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 1f3a9bf394b9b..69f948839259a 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -300,7 +300,7 @@ def test_xor(): clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) - clf = Tree(random_state=0, max_features=1) + clf = Tree(random_state=0, max_features=X.shape[1]) clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) @@ -440,7 +440,7 @@ def test_importances(): X, y = datasets.make_classification( n_samples=5000, n_features=10, - n_informative=3, + n_informative=4, n_redundant=0, n_repeated=0, shuffle=False, @@ -455,7 +455,7 @@ def test_importances(): n_important = np.sum(importances > 0.1) assert importances.shape[0] == 10, "Failed with {0}".format(name) - assert n_important == 3, "Failed with {0}".format(name) + assert n_important == 4, "Failed with {0}".format(name) # Check on iris 
that importances are the same for all builders clf = DecisionTreeClassifier(random_state=0) @@ -466,9 +466,9 @@ def test_importances(): assert_array_equal(clf.feature_importances_, clf2.feature_importances_) -def test_importances_raises(): +@pytest.mark.parametrize("clf", [DecisionTreeClassifier()]) +def test_importances_raises(clf): # Check if variable importance before fit raises ValueError. - clf = DecisionTreeClassifier() with pytest.raises(ValueError): getattr(clf, "feature_importances_") @@ -653,6 +653,7 @@ def test_min_samples_leaf(): est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) + # drop inner nodes leaf_count = node_counts[node_counts != 0] assert np.min(leaf_count) > 4, "Failed with {0}".format(name) @@ -677,7 +678,7 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): else: X = DATASETS[datasets]["X"].astype(np.float32) y = DATASETS[datasets]["y"] - + rng = np.random.RandomState(42) weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) @@ -828,7 +829,7 @@ def test_min_impurity_decrease(): ) # Check with a much lower value of 0.0001 est3 = TreeEstimator( - max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=0 + max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=1 ) # Check with a much lower value of 0.1 est4 = TreeEstimator( @@ -918,6 +919,7 @@ def test_pickle(): est2 = pickle.loads(serialized_object) assert type(est2) == est.__class__ + # score should match before/after pickling score2 = est2.score(X, y) assert ( score == score2 @@ -1031,7 +1033,6 @@ def test_memory_layout(): ALL_TREES.items(), [np.float64, np.float32] ): est = TreeEstimator(random_state=0) - # Nothing X = np.asarray(iris.data, dtype=dtype) y = iris.target @@ -1052,6 +1053,11 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) + # Strided + X = np.asarray(iris.data[::3], dtype=dtype) + y = iris.target[::3] + assert_array_equal(est.fit(X, y).predict(X), y) + # csr matrix X = csr_matrix(iris.data, dtype=dtype) y = iris.target @@ -1062,11 +1068,6 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) - # Strided - X = np.asarray(iris.data[::3], dtype=dtype) - y = iris.target[::3] - assert_array_equal(est.fit(X, y).predict(X), y) - def test_sample_weight(): # Check sample weighting. 
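
As a rough illustration of what the `_compute_feature_importances` refactor above accumulates per split node (and what `test_importances` exercises), the weighted impurity decrease can be sketched in plain NumPy as follows. The function name and the flat per-node arrays are hypothetical stand-ins for the Cython `Node` buffers, not the actual fork API.

    import numpy as np

    def impurity_decrease_importances(feature, weighted_n, impurity,
                                      left, right, n_features, normalize=True):
        # Arrays are indexed by node id; `left`/`right` hold child ids, -1 for leaves.
        importances = np.zeros(n_features)
        for node in range(len(feature)):
            if left[node] == -1:  # leaves contribute nothing
                continue
            l, r = left[node], right[node]
            importances[feature[node]] += (
                weighted_n[node] * impurity[node]
                - weighted_n[l] * impurity[l]
                - weighted_n[r] * impurity[r]
            )
        importances /= weighted_n[0]  # scale by weighted samples at the root
        if normalize and importances.sum() > 0.0:
            importances /= importances.sum()
        return importances

The result corresponds, up to the subclass hook, to what `compute_feature_importances(normalize=True)` exposes as `feature_importances_` on a fitted tree.
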
@@ -1260,7 +1261,7 @@ def test_behaviour_constant_feature_after_splits(): y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3] for name, TreeEstimator in ALL_TREES.items(): # do not check extra random trees - if "ExtraTree" not in name: + if all(_name not in name for _name in ["ExtraTree"]): est = TreeEstimator(random_state=0, max_features=1) est.fit(X, y) assert est.tree_.max_depth == 2 @@ -1586,6 +1587,7 @@ def check_min_weight_leaf_split_level(name): sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2] _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight) + # skip for sparse inputs _check_min_weight_leaf_split_level(TreeEstimator, csc_matrix(X), y, sample_weight) @@ -1644,6 +1646,7 @@ def check_decision_path(name): # Assert that leaves index are correct leaves = est.apply(X) leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)] + assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) # Ensure only one leave node per sample @@ -1930,6 +1933,7 @@ def assert_is_subtree(tree, subtree): def test_apply_path_readonly_all_trees(name, splitter, X_format): dataset = DATASETS["clf_small"] X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False) + if X_format == "dense": X_readonly = create_memmap_backed_data(X_small) else: From 475bd05f779a4be4f301f751ac86ba6a998a219a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 29 Mar 2023 09:41:10 -0700 Subject: [PATCH 02/39] Docs (#39) #### Reference Issues/PRs Fixes README and wheel building --------- Signed-off-by: Adam Li --- README.rst | 36 ++++++++++++--------- build_tools/azure/install.sh | 2 +- build_tools/github/repair_windows_wheels.sh | 2 +- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/README.rst b/README.rst index fbdfdaa95ef4c..7a7bd41c42846 100644 --- a/README.rst +++ b/README.rst @@ -44,6 +44,10 @@ .. |PytestMinVersion| replace:: 5.3.1 .. |PlotlyMinVersion| replace:: 5.10.0 +================= +Scikit-learn-tree +================= + ``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is released under the name ``scikit-learn-tree`` to avoid confusion. @@ -94,8 +98,7 @@ Installing scikit-learn-tree ============================ Scikit-learn-tree is a maintained fork of scikit-learn, which extends the -tree submodule in a few ways documented in :ref:`changelog of the fork -`. +tree submodule in a few ways documented in `fork_changelog`_. We release versions of scikit-learn-tree in an analagous fashion to scikit-learn main. Due to maintenance resources, we only release on PyPi @@ -103,12 +106,11 @@ and recommend therefore installing with ``pip``. There are different ways to install scikit-learn-tree: - * :ref:`Install the latest official release `. This + * Install the latest official release `install_fork_release`_. This is the best approach for most users. It will provide a stable version and pre-built packages are available for most platforms. - * :ref:`Building the package from source - `. This is best for users who want the + * Building the package from source `install_source`_. This is best for users who want the latest-and-greatest features and aren't afraid of running brand-new code. This is also needed for users who wish to contribute to the project. @@ -119,9 +121,7 @@ Installing the latest release ----------------------------- We release wheels for common distributions and this is thus installable via pip. -.. 
prompt:: bash $ - - pip install scikit-learn-tree + pip install scikit-learn-tree This will install ``scikit-learn-tree`` under the namespace of ``sklearn``, which then can be used as a stand-in for any package that relies on the public API of ``sklearn``. @@ -146,9 +146,11 @@ features to the fork, the building from source instructions are exactly the same as that of scikit-learn main, so please refer to `scikit-learn documentation `_ for instructions on building from source. -Development =========== +Development +----------- + We welcome new contributors of all experience levels, specifically to maintain the fork. Any contributions that make sure our fork is "better in-line" with scikit-learn upstream, or improves the tree submodule in anyway will be appreciated. @@ -158,15 +160,17 @@ The scikit-learn community goals are to be helpful, welcoming, and effective. Th has detailed information about contributing code, documentation, tests, and more. We've included some basic information in this README. -.. _fork-changelog: -Major Changes of the Fork ========================= +.. _fork_changelog: + +Major Changes of the Fork +------------------------- + The purpose of this page is to illustrate some of the main features that ``scikit-learn-tree`` provides compared to ``scikit-learn``. It assumes a an understanding of core package ``scikit-learn`` and also decision trees -models. Please refer to our :ref:`installation instructions -` for installing ``scikit-learn-tree``. +models. Please refer to our installation instructions `install_fork_release`_ for installing ``scikit-learn-tree``. Scikit-learn-tree though operates as a stand-in for upstream ``scikit-learn``. It is used in packages exactly the same way and will support all features @@ -193,7 +197,7 @@ Candidate changes and PRs accepted into the fork are those that: Decision tree generalizations ----------------------------- -``Scikit-learn`` provides an axis-aligned :class:`~sklearn.tree.DecisionTreeClassifier` +``Scikit-learn`` provides an axis-aligned `sklearn.tree.DecisionTreeClassifier `_ decision tree model (classifier and regressor), which has a few fundamental limitations that prevent 3rd parties from utilizing the existing class, without forking a large amount of copy/pasted Python and Cython code. We highlight those limitations here @@ -239,8 +243,8 @@ Python API: random forests and their variants to scale to millions of samples. - Our fix: We added a ``max_bins=None`` keyword argument to the ``BaseForest`` class, and all its subclasses. The default behavior is no binning. The current implementation is not necessarily efficient. There are several improvements to be made. See below. -Overall, the existing tree models, such as :class:`~sklearn.tree.DecisionTreeClassifier` -and :class:`~sklearn.ensemble.RandomForestClassifier` all work exactly the same as they +Overall, the existing tree models, such as `sklearn.tree.DecisionTreeClassifier `_ +and `sklearn.ensemble.RandomForestClassifier `_ all work exactly the same as they would in ``scikit-learn`` main, but these extensions enable 3rd-party packages to extend the Cython/Python API easily. 
diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index db5b5d9414053..5238cd1121d2e 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -7,7 +7,7 @@ set -x source build_tools/shared.sh UNAMESTR=`uname` -CCACHE_LINKS_DIR="/tmp/ccachev2" +CCACHE_LINKS_DIR="/tmp/ccache" setup_ccache() { CCACHE_BIN=`which ccache || echo ""` diff --git a/build_tools/github/repair_windows_wheels.sh b/build_tools/github/repair_windows_wheels.sh index cdd0c0c79d8c4..a857e61067960 100755 --- a/build_tools/github/repair_windows_wheels.sh +++ b/build_tools/github/repair_windows_wheels.sh @@ -9,7 +9,7 @@ DEST_DIR=$2 # By default, the Windows wheels are not repaired. # In this case, we need to vendor VCRUNTIME140.dll wheel unpack "$WHEEL" -WHEEL_DIRNAME=$(ls -d scikit_learn-*) +WHEEL_DIRNAME=$(ls -d scikit_learn_tree-*) python build_tools/github/vendor.py "$WHEEL_DIRNAME" wheel pack "$WHEEL_DIRNAME" -d "$DEST_DIR" rm -rf "$WHEEL_DIRNAME" From 706a74273bf736066b1d71eeed9da08c0943e311 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 4 Apr 2023 14:47:24 -0700 Subject: [PATCH 03/39] Release v1.2.2 #### Reference Issues/PRs #### What does this implement/fix? Explain your changes. #### Any other comments? --------- Signed-off-by: Adam Li --- .github/workflows/check-upstream.yml | 27 +++++++++++++++++++++++++++ sklearn/__init__.py | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/check-upstream.yml diff --git a/.github/workflows/check-upstream.yml b/.github/workflows/check-upstream.yml new file mode 100644 index 0000000000000..80e8ace610607 --- /dev/null +++ b/.github/workflows/check-upstream.yml @@ -0,0 +1,27 @@ +# Create Github Actions workflow that checks upstream scikit-learn 'main' branch and +# creates or updates +# an existing pull request to https://github.com/neurodata/scikit-learn:fork. +# Runs the check weekly. +# Creates a pull request if there are changes. + +# name: Check upstream scikit-learn + +# on: +# schedule: +# - cron: '0 0 * * 0' + +# jobs: +# check-upstream: +# runs-on: ubuntu-latest +# steps: +# - uses: actions/checkout@v2 +# - name: Check upstream scikit-learn +# uses: neurodata/check-upstream@main +# with: +# upstream: scikit-learn/scikit-learn +# fork: neurodata/scikit-learn +# branch: fork +# token: ${{ secrets.GITHUB_TOKEN }} + +# # Creates a pull request if there are changes. + diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 47bb893bd00a0..6d5af7c771fb8 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -39,7 +39,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = "1.3.dev0" +__version__ = "1.2.2" # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded From a22db039704399a31d466be861f2b5a86bbc51b3 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 11 Apr 2023 15:25:44 -0400 Subject: [PATCH 04/39] Update README Signed-off-by: Adam Li --- README.rst | 4 ++-- sklearn/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 7a7bd41c42846..444ead93017b9 100644 --- a/README.rst +++ b/README.rst @@ -48,7 +48,7 @@ Scikit-learn-tree ================= -``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line +``scikit-learn-tree`` is an alias of scikit-learn. 
It is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is released under the name ``scikit-learn-tree`` to avoid confusion. @@ -85,7 +85,7 @@ Installation Dependencies ~~~~~~~~~~~~ -scikit-learn requires: +scikit-learn-tree requires: - Python (>= |PythonMinVersion|) - NumPy (>= |NumPyMinVersion|) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 6d5af7c771fb8..4d7badd6b678e 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -39,7 +39,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = "1.2.2" +__version__ = "1.3.0dev0" # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded From 9c5321daa396e0fd01cc6e582a5dfcc8ccb1afe5 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:09:58 -0400 Subject: [PATCH 05/39] Adding working submodule Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 23b999d76326e..f4a1a80123d26 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1022,6 +1022,7 @@ cdef class BaseTree: cdef Node* end_node = node + self.node_count cdef double normalizer = 0. + cdef int i = 0 cdef cnp.float64_t[:] importances = np.zeros(self.n_features) From f82f2582c0c5e347fd9a6109129c3ae7853b0593 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:40:52 -0400 Subject: [PATCH 06/39] Merged main Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 2 ++ sklearn/tree/_splitter.pyx | 4 ++-- sklearn/tree/_tree.pyx | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 3419c6fa08819..01975df22ef23 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -110,6 +110,8 @@ cdef class Splitter(BaseSplitter): cdef bint check_presplit_conditions( self, SplitRecord current_split, + SIZE_t n_missing, + bint missing_go_to_left, ) noexcept nogil cdef bint check_postsplit_conditions( self diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index c8df3de1bb900..ae6cd772e37f7 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -505,7 +505,7 @@ cdef inline int node_split_best( current_split.pos = p # Reject if min_samples_leaf is not guaranteed - if splitter.check_presplit_conditions(current_split) == 1: + if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue criterion.update(current_split.pos) @@ -834,7 +834,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if splitter.check_presplit_conditions(current_split) == 1: + if splitter.check_presplit_conditions(current_split, 0, 0) == 1: continue # Evaluate split diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index c8248ed65c36b..33a2a8308de5f 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -764,7 +764,7 @@ cdef class BaseTree: # While node not a leaf while node.left_child != _TREE_LEAF: - X_i_node_features = self._compute_feature(X_ndarray, i, node) + X_i_node_feature = self._compute_feature(X_ndarray, i, node) # ... 
and node.right_child != _TREE_LEAF: if isnan(X_i_node_feature): if node.missing_go_to_left: From 7e38502806e954d9b3084f8a5e22602556236fe4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:42:44 -0400 Subject: [PATCH 07/39] Successful merge with the missing value support Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 2 ++ sklearn/tree/tests/test_tree.py | 32 ++++++++++++++------------------ 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 638c51f1101bc..21fa5b7c200b2 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -388,6 +388,7 @@ def _fit( X, y, sample_weight, + feature_has_missing, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -403,6 +404,7 @@ def _build_tree( X, y, sample_weight, + feature_has_missing, min_samples_leaf, min_weight_leaf, max_leaf_nodes, diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 6be168e4c8e7c..eefae6cdaa3f6 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -300,7 +300,7 @@ def test_xor(): clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) - clf = Tree(random_state=0, max_features=X.shape[1]) + clf = Tree(random_state=0, max_features=1) clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) @@ -440,7 +440,7 @@ def test_importances(): X, y = datasets.make_classification( n_samples=5000, n_features=10, - n_informative=4, + n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, @@ -455,7 +455,7 @@ def test_importances(): n_important = np.sum(importances > 0.1) assert importances.shape[0] == 10, "Failed with {0}".format(name) - assert n_important == 4, "Failed with {0}".format(name) + assert n_important == 3, "Failed with {0}".format(name) # Check on iris that importances are the same for all builders clf = DecisionTreeClassifier(random_state=0) @@ -466,9 +466,9 @@ def test_importances(): assert_array_equal(clf.feature_importances_, clf2.feature_importances_) -@pytest.mark.parametrize("clf", [DecisionTreeClassifier()]) -def test_importances_raises(clf): +def test_importances_raises(): # Check if variable importance before fit raises ValueError. 
+ clf = DecisionTreeClassifier() with pytest.raises(ValueError): getattr(clf, "feature_importances_") @@ -653,7 +653,6 @@ def test_min_samples_leaf(): est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) - # drop inner nodes leaf_count = node_counts[node_counts != 0] assert np.min(leaf_count) > 4, "Failed with {0}".format(name) @@ -678,7 +677,7 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): else: X = DATASETS[datasets]["X"].astype(np.float32) y = DATASETS[datasets]["y"] - rng = np.random.RandomState(42) + weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) @@ -829,7 +828,7 @@ def test_min_impurity_decrease(): ) # Check with a much lower value of 0.0001 est3 = TreeEstimator( - max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=1 + max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=0 ) # Check with a much lower value of 0.1 est4 = TreeEstimator( @@ -919,7 +918,6 @@ def test_pickle(): est2 = pickle.loads(serialized_object) assert type(est2) == est.__class__ - # score should match before/after pickling score2 = est2.score(X, y) assert ( score == score2 @@ -1033,6 +1031,7 @@ def test_memory_layout(): ALL_TREES.items(), [np.float64, np.float32] ): est = TreeEstimator(random_state=0) + # Nothing X = np.asarray(iris.data, dtype=dtype) y = iris.target @@ -1053,11 +1052,6 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) - # Strided - X = np.asarray(iris.data[::3], dtype=dtype) - y = iris.target[::3] - assert_array_equal(est.fit(X, y).predict(X), y) - # csr matrix X = csr_matrix(iris.data, dtype=dtype) y = iris.target @@ -1068,6 +1062,11 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) + # Strided + X = np.asarray(iris.data[::3], dtype=dtype) + y = iris.target[::3] + assert_array_equal(est.fit(X, y).predict(X), y) + def test_sample_weight(): # Check sample weighting. 
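
For reference, the traversal that `apply` performs after this merge (including the missing-value routing via `missing_go_to_left`) can be sketched in pure Python roughly as below. `compute_feature` stands in for the Cython `_compute_feature` hook, which is a plain column lookup `x[node.feature]` for axis-aligned trees and which subclasses may override; the function name and the `TREE_LEAF` sentinel here are illustrative, not the exact internal API.

    import numpy as np

    TREE_LEAF = -1

    def apply_one(nodes, x, compute_feature):
        # nodes: sequence of objects with left_child, right_child, feature,
        # threshold and missing_go_to_left attributes; node 0 is the root.
        node_id = 0
        while nodes[node_id].left_child != TREE_LEAF:
            node = nodes[node_id]
            value = compute_feature(x, node)  # x[node.feature] in the axis-aligned case
            if np.isnan(value):
                node_id = node.left_child if node.missing_go_to_left else node.right_child
            elif value <= node.threshold:
                node_id = node.left_child
            else:
                node_id = node.right_child
        return node_id  # id of the leaf reached by sample x
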
@@ -1261,7 +1260,7 @@ def test_behaviour_constant_feature_after_splits(): y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3] for name, TreeEstimator in ALL_TREES.items(): # do not check extra random trees - if all(_name not in name for _name in ["ExtraTree"]): + if "ExtraTree" not in name: est = TreeEstimator(random_state=0, max_features=1) est.fit(X, y) assert est.tree_.max_depth == 2 @@ -1587,7 +1586,6 @@ def check_min_weight_leaf_split_level(name): sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2] _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight) - # skip for sparse inputs _check_min_weight_leaf_split_level(TreeEstimator, csc_matrix(X), y, sample_weight) @@ -1646,7 +1644,6 @@ def check_decision_path(name): # Assert that leaves index are correct leaves = est.apply(X) leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)] - assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) # Ensure only one leave node per sample @@ -1933,7 +1930,6 @@ def assert_is_subtree(tree, subtree): def test_apply_path_readonly_all_trees(name, splitter, X_format): dataset = DATASETS["clf_small"] X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False) - if X_format == "dense": X_readonly = create_memmap_backed_data(X_small) else: From 34a562130d9c92b083b6da99c27a12a7623226b7 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:53:07 -0400 Subject: [PATCH 08/39] Add cyton headers Signed-off-by: Adam Li --- sklearn/tree/_criterion.pyx | 3 +++ sklearn/tree/_splitter.pyx | 3 +++ sklearn/tree/_tree.pyx | 3 +++ sklearn/tree/_utils.pyx | 3 +++ 4 files changed, 12 insertions(+) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 9c59e75fedb10..8fbcafcaf1456 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index ae6cd772e37f7..a58514d093ddf 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 33a2a8308de5f..2256b28c7df10 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 669d69409fdc3..0a7522bcf4255 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Arnaud Joly From f35c758189c8d38bfed56071b8c9a6cbbd39056f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 14:04:19 -0400 Subject: [PATCH 09/39] Fix imports to be absolute Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 32 ++++++++++++++++---------------- sklearn/tree/_export.py | 11 ++++++++--- sklearn/tree/_utils.pxd | 2 +- sklearn/tree/_utils.pyx | 2 +- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py 
index 21fa5b7c200b2..4fdd8f27cd652 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -25,22 +25,22 @@ import numpy as np from scipy.sparse import issparse -from ..base import BaseEstimator -from ..base import ClassifierMixin -from ..base import clone -from ..base import RegressorMixin -from ..base import is_classifier -from ..base import MultiOutputMixin -from ..utils import Bunch -from ..utils import check_random_state -from ..utils.validation import _check_sample_weight -from ..utils.validation import assert_all_finite -from ..utils.validation import _assert_all_finite_element_wise -from ..utils import compute_sample_weight -from ..utils.multiclass import check_classification_targets -from ..utils.validation import check_is_fitted -from ..utils._param_validation import Hidden, Interval, StrOptions -from ..utils._param_validation import RealNotInt +from sklearn.base import BaseEstimator +from sklearn.base import ClassifierMixin +from sklearn.base import clone +from sklearn.base import RegressorMixin +from sklearn.base import is_classifier +from sklearn.base import MultiOutputMixin +from sklearn.utils import Bunch +from sklearn.utils import check_random_state +from sklearn.utils.validation import _check_sample_weight +from sklearn.utils.validation import assert_all_finite +from sklearn.utils.validation import _assert_all_finite_element_wise +from sklearn.utils import compute_sample_weight +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_is_fitted +from sklearn.utils._param_validation import Hidden, Interval, StrOptions +from sklearn.utils._param_validation import RealNotInt from ._criterion import BaseCriterion from ._splitter import BaseSplitter diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index e8dbe51138223..be545de0202d0 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -16,10 +16,15 @@ import numpy as np -from ..utils.validation import check_is_fitted, check_array -from ..utils._param_validation import Interval, validate_params, StrOptions, HasMethods +from sklearn.utils.validation import check_is_fitted, check_array +from sklearn.utils._param_validation import ( + Interval, + validate_params, + StrOptions, + HasMethods, +) -from ..base import is_classifier +from sklearn.base import is_classifier from . import _criterion from . 
import _tree diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 4938d3030245f..f7bae4c5c8553 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -10,7 +10,7 @@ cimport numpy as cnp from ._tree cimport Node -from ..neighbors._quad_tree cimport Cell +from sklearn.neighbors._quad_tree cimport Cell ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 0a7522bcf4255..bc7e17f8766d8 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -19,7 +19,7 @@ import numpy as np cimport numpy as cnp cnp.import_array() -from ..utils._random cimport our_rand_r +from sklearn.utils._random cimport our_rand_r # ============================================================================= # Helper functions From 45320b4d3ef05b4ccbe81e8c13676b1c755d1973 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 14:17:25 -0400 Subject: [PATCH 10/39] Fix forest import Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 4cc672bb6884d..4d9bf862bd806 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -50,11 +50,16 @@ class calls the ``fit`` method of each sub-estimator on random samples from scipy.sparse import issparse from scipy.sparse import hstack as sparse_hstack -from ..base import is_classifier -from ..base import ClassifierMixin, MultiOutputMixin, RegressorMixin, TransformerMixin +from sklearn.base import is_classifier +from sklearn.base import ( + ClassifierMixin, + MultiOutputMixin, + RegressorMixin, + TransformerMixin, +) -from ..metrics import accuracy_score, r2_score -from ..preprocessing import OneHotEncoder +from sklearn.metrics import accuracy_score, r2_score +from sklearn.preprocessing import OneHotEncoder from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -63,21 +68,21 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DTYPE, DOUBLE -from ..utils import check_random_state, compute_sample_weight -from ..exceptions import DataConversionWarning -from ._base import BaseEnsemble, _partition_estimators -from ..utils.parallel import delayed, Parallel -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.validation import ( +from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.exceptions import DataConversionWarning +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.utils.parallel import delayed, Parallel +from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.validation import ( check_is_fitted, _check_sample_weight, _check_feature_names_in, ) -from ..utils._openmp_helpers import _openmp_effective_n_threads -from ..utils.validation import _num_samples -from ..utils._param_validation import Interval, StrOptions -from ..utils._param_validation import RealNotInt -from ._hist_gradient_boosting.binning import _BinMapper +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils.validation import _num_samples +from sklearn.utils._param_validation import Interval, StrOptions +from sklearn.utils._param_validation import RealNotInt +from sklearn.ensemble._hist_gradient_boosting.binning 
import _BinMapper __all__ = [ "RandomForestClassifier", From 49526f026c46727aa272be7bdd7a44d0101c089f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 13 Jun 2023 15:19:07 -0400 Subject: [PATCH 11/39] Fix classes and criterion Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 67 ++++++++++++++++++++++++++++++++++++ sklearn/tree/_criterion.pxd | 11 ++++-- sklearn/tree/_criterion.pyx | 68 +++++++++++++++++++++++++++++++++++-- 3 files changed, 141 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 4fdd8f27cd652..795c68c8b5081 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -713,6 +713,73 @@ def feature_importances_(self): return self.tree_.compute_feature_importances() + def _get_y_for_leaves(self, X, sample_weight=None): + n_samples = X.shape[0] + + # get the predictions + X_leaves = self.apply(X) + + bootstrap_indices = np.empty(shape, dtype=np.int64) + for i, estimator in enumerate(self.estimators_): + # Get bootstrap indices. + if self.bootstrap: + n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples) + bootstrap_indices[:, i] = _generate_sample_indices( + estimator.random_state, n_samples, n_samples_bootstrap + ) + else: + bootstrap_indices[:, i] = np.arange(n_samples) + + # Get predictions on bootstrap indices. + X_leaves[:, i] = X_leaves[bootstrap_indices[:, i], i] + + if sorter is not None: + # Reassign bootstrap indices to account for target sorting. + bootstrap_indices = np.argsort(sorter)[bootstrap_indices] + + bootstrap_indices += 1 # for sparse matrix (0s as empty) + + # Get the maximum number of nodes (internal + leaves) across trees. + # Get the maximum number of samples per leaf across trees (if needed). + max_node_count = 0 + max_samples_leaf = 0 if not leaf_subsample else max_samples_leaf + for i, estimator in enumerate(self.estimators_): + node_count = estimator.tree_.node_count + if node_count > max_node_count: + max_node_count = node_count + if not leaf_subsample: + sample_count = np.max(np.bincount(X_leaves[:, i])) + if sample_count > max_samples_leaf: + max_samples_leaf = sample_count + + # Initialize NumPy array (more efficient serialization than dict/list). + shape = (self.n_estimators, max_node_count, max_samples_leaf) + y_train_leaves = np.zeros(shape, dtype=np.int64) + + for i, estimator in enumerate(self.estimators_): + # Group training indices by leaf node. + leaf_indices, leaf_values_list = _group_by_value(X_leaves[:, i]) + + if leaf_subsample: + random.seed(estimator.random_state) + + # Map each leaf node to its list of training indices. + for leaf_idx, leaf_values in zip(leaf_indices, leaf_values_list): + y_indices = bootstrap_indices[:, i][leaf_values] + + if sample_weight is not None: + y_indices = y_indices[sample_weight[y_indices - 1] > 0] + + # Subsample leaf training indices (without replacement). + if leaf_subsample and max_samples_leaf < len(y_indices): + if not isinstance(y_indices, list): + y_indices = list(y_indices) + y_indices = random.sample(y_indices, max_samples_leaf) + + y_train_leaves[i, leaf_idx, : len(y_indices)] = y_indices + + return y_train_leaves + # ============================================================================= # Public estimators diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 6cfc33c5bdcea..d72f22f8b348d 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -11,6 +11,8 @@ # See _criterion.pyx for implementation details. 
+# from libcpp.vector cimport vector + from ._tree cimport DTYPE_t # Type of X from ._tree cimport DOUBLE_t # Type of y, sample_weight from ._tree cimport SIZE_t # Type for indices and counters @@ -19,7 +21,7 @@ from ._tree cimport UINT32_t # Unsigned 32 bit integer cdef class BaseCriterion: - """Abstract interface for criterion.""" + """Abstract interface for criterion.""" # Internal structures cdef const DOUBLE_t[:] sample_weight # Sample weights @@ -70,13 +72,18 @@ cdef class BaseCriterion: SIZE_t end ) noexcept nogil + # cdef void node_samples( + # self, + # vector[vector[DOUBLE_t]]* dest + # ) noexcept nogil + cdef class Criterion(BaseCriterion): """Abstract interface for supervised impurity criteria.""" cdef const DOUBLE_t[:, ::1] y # Values of y cdef SIZE_t n_missing # Number of missing values for the feature being evaluated cdef bint missing_go_to_left # Whether missing values go to the left node - + cdef int init( self, const DOUBLE_t[:, ::1] y, diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 8fbcafcaf1456..e9c02ab2fa43d 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -39,10 +39,13 @@ cdef class BaseCriterion: covariates, or labels, or both. Although scikit-learn currently only contains supervised tree methods, this class enables 3rd party packages to leverage scikit-learn's Cython code for criteria. + The downstream classes _must_ implement methods to compute the impurity in current node and in children nodes. + This object stores methods on how to calculate how good a split is using a set API. + Samples in the "current" node are stored in `samples[start:end]` which is partitioned around `pos` (an index in `start:end`) so that: - the samples of left child node are stored in `samples[start:pos]` @@ -56,21 +59,25 @@ cdef class BaseCriterion: cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. + This method must be implemented by the subclass. """ pass cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. + This method must be implemented by the subclass. """ pass cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. + This updates the collected statistics by moving sample_indices[pos:new_pos] from the right child to the left child. It must be implemented by the subclass. + Parameters ---------- new_pos : SIZE_t @@ -80,6 +87,7 @@ cdef class BaseCriterion: cdef double node_impurity(self) noexcept nogil: """Placeholder for calculating the impurity of the node. + Placeholder for a method which will evaluate the impurity of the current node, i.e. the impurity of sample_indices[start:end]. This is the primary function of the criterion class. The smaller the impurity the @@ -90,9 +98,11 @@ cdef class BaseCriterion: cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Placeholder for calculating the impurity of children. + Placeholder for a method which evaluates the impurity in children nodes, i.e. the impurity of sample_indices[start:pos] + the impurity of sample_indices[pos:end]. + Parameters ---------- impurity_left : double pointer @@ -106,8 +116,10 @@ cdef class BaseCriterion: cdef void node_value(self, double* dest) noexcept nogil: """Placeholder for storing the node value. + Placeholder for a method which will compute the node value of sample_indices[start:end] and save the value into dest. 
+ Parameters ---------- dest : double pointer @@ -117,10 +129,12 @@ cdef class BaseCriterion: cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -135,6 +149,7 @@ cdef class BaseCriterion: double impurity_left, double impurity_right) noexcept nogil: """Compute the improvement in impurity. + This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: N_t / N * (impurity - N_t_R / N_t * right_impurity @@ -142,6 +157,7 @@ cdef class BaseCriterion: where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, + Parameters ---------- impurity_parent : double @@ -150,6 +166,7 @@ cdef class BaseCriterion: The impurity of the left child impurity_right : double The impurity of the right child + Return ------ double : improvement in impurity after the split occurs @@ -166,10 +183,12 @@ cdef class BaseCriterion: SIZE_t end ) noexcept nogil: """Abstract method which will set sample pointers in the criterion. + The dataset array that we compute criteria on is assumed to consist of 'N' ordered samples or rows (i.e. sorted). Since we pass this by reference, we use sample pointers to move the start and end around to consider only a subset of data. This function should also update relevant statistics that the class uses to compute the final criterion. + Parameters ---------- start : SIZE_t @@ -182,11 +201,13 @@ cdef class BaseCriterion: cdef class Criterion(BaseCriterion): """Interface for impurity criteria. + The supervised criterion computes the impurity of a node and the reduction of impurity of a split on that node using the distribution of labels in parent and - children nodes. It also computes the output statistics - such as the mean in regression and class probabilities in classification. - Instances of this class are responsible for compute splits' impurity difference + children nodes. It also computes the output statistics such as the mean in regression + and class probabilities in classification. Instances of this class are responsible + for compute splits' impurity difference. + Criterion is the base class for criteria used in supervised tree-based models with a homogeneous float64-dtyped y. """ @@ -198,8 +219,10 @@ cdef class Criterion(BaseCriterion): const SIZE_t[:] sample_indices ) except -1 nogil: """Placeholder for a method which will initialize the criterion. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. + Parameters ---------- y : ndarray, dtype=DOUBLE_t @@ -279,6 +302,7 @@ cdef class ClassificationCriterion(Criterion): def __cinit__(self, SIZE_t n_outputs, cnp.ndarray[SIZE_t, ndim=1] n_classes): """Initialize attributes for this criterion. + Parameters ---------- n_outputs : SIZE_t @@ -331,8 +355,10 @@ cdef class ClassificationCriterion(Criterion): const SIZE_t[:] sample_indices ) except -1 nogil: """Initialize the criterion. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. 
+ Parameters ---------- y : ndarray, dtype=DOUBLE_t @@ -426,6 +452,7 @@ cdef class ClassificationCriterion(Criterion): cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -442,6 +469,7 @@ cdef class ClassificationCriterion(Criterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -458,8 +486,10 @@ cdef class ClassificationCriterion(Criterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. + Parameters ---------- new_pos : SIZE_t @@ -532,6 +562,7 @@ cdef class ClassificationCriterion(Criterion): cdef void node_value(self, double* dest) noexcept nogil: """Compute the node value of sample_indices[start:end] and save it into dest. + Parameters ---------- dest : double pointer @@ -546,17 +577,20 @@ cdef class ClassificationCriterion(Criterion): cdef class Entropy(ClassificationCriterion): r"""Cross Entropy impurity criterion. + This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) be the proportion of class k observations in node m. + The cross-entropy is then defined as cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the cross-entropy criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -578,8 +612,10 @@ cdef class Entropy(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). + Parameters ---------- impurity_left : double pointer @@ -611,11 +647,13 @@ cdef class Entropy(ClassificationCriterion): cdef class Gini(ClassificationCriterion): r"""Gini Index impurity criterion. + This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) be the proportion of class k observations in node m. + The Gini Index is then defined as: index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 @@ -623,6 +661,7 @@ cdef class Gini(ClassificationCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the Gini criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -648,8 +687,10 @@ cdef class Gini(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]) using the Gini index. 
+ Parameters ---------- impurity_left : double pointer @@ -726,6 +767,7 @@ cdef inline void _move_sums_regression( cdef class RegressionCriterion(Criterion): r"""Abstract regression criterion. + This handles cases where the target is a continuous value, and is evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` @@ -736,6 +778,7 @@ cdef class RegressionCriterion(Criterion): def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. + Parameters ---------- n_outputs : SIZE_t @@ -961,6 +1004,7 @@ cdef class MSE(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the MSE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -976,10 +1020,12 @@ cdef class MSE(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. The MSE proxy is derived from @@ -1002,6 +1048,7 @@ cdef class MSE(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -1045,6 +1092,7 @@ cdef class MSE(RegressionCriterion): cdef class MAE(RegressionCriterion): r"""Mean absolute error impurity criterion. + MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true value and f_i is the predicted value.""" @@ -1056,6 +1104,7 @@ cdef class MAE(RegressionCriterion): def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. + Parameters ---------- n_outputs : SIZE_t @@ -1154,6 +1203,7 @@ cdef class MAE(RegressionCriterion): cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1184,6 +1234,7 @@ cdef class MAE(RegressionCriterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1211,6 +1262,7 @@ cdef class MAE(RegressionCriterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1273,6 +1325,7 @@ cdef class MAE(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the MAE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -1297,6 +1350,7 @@ cdef class MAE(RegressionCriterion): cdef void children_impurity(self, double* p_impurity_left, double* p_impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. 
the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -1343,6 +1397,7 @@ cdef class MAE(RegressionCriterion): cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman. + Uses the formula (35) in Friedman's original Gradient Boosting paper: diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) @@ -1350,10 +1405,12 @@ cdef class FriedmanMSE(MSE): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -1394,6 +1451,7 @@ cdef class FriedmanMSE(MSE): cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. + Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. It remains the @@ -1413,6 +1471,7 @@ cdef class Poisson(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the Poisson criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -1422,10 +1481,12 @@ cdef class Poisson(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. The Poisson proxy is derived from: @@ -1463,6 +1524,7 @@ cdef class Poisson(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity of the right child (sample_indices[pos:end]) for Poisson. """ From 2105949178bf03660c13df1fd197abbbb57d826e Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 13 Jun 2023 15:22:15 -0400 Subject: [PATCH 12/39] Working.. Signed-off-by: Adam Li --- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 4 +++- sklearn/tree/_splitter.pxd | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index d72f22f8b348d..20020b4a5361c 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -11,7 +11,7 @@ # See _criterion.pyx for implementation details. 
-# from libcpp.vector cimport vector +from libcpp.vector cimport vector from ._tree cimport DTYPE_t # Type of X from ._tree cimport DOUBLE_t # Type of y, sample_weight diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e9c02ab2fa43d..d60cab3063c1b 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -34,7 +34,9 @@ from ._utils cimport WeightedMedianCalculator cdef double EPSILON = 10 * np.finfo('double').eps cdef class BaseCriterion: - """This is an abstract interface for criterion. For example, a tree model could + """This is an abstract interface for criterion. + + For example, a tree model could be either supervisedly, or unsupervisedly computing impurity on samples of covariates, or labels, or both. Although scikit-learn currently only contains supervised tree methods, this class enables 3rd party packages to leverage diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 01975df22ef23..fc49471569ecc 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -97,7 +97,7 @@ cdef class BaseSplitter: cdef class Splitter(BaseSplitter): cdef public Criterion criterion # Impurity criterion cdef const DOUBLE_t[:, ::1] y - + cdef int init( self, object X, From 9b07f2ab2b1b6f8f4ea1294fce1a5f9bd3be1a1d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 13 Jun 2023 15:42:37 -0400 Subject: [PATCH 13/39] Add leaf storage ability Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 67 ---------------------------- sklearn/tree/_criterion.pxd | 9 ++-- sklearn/tree/_criterion.pyx | 28 +++++++++--- sklearn/tree/_splitter.pxd | 3 ++ sklearn/tree/_splitter.pyx | 31 ++++++------- sklearn/tree/_tree.pxd | 19 ++++++-- sklearn/tree/_tree.pyx | 88 +++++++++++++++++++++++++------------ 7 files changed, 122 insertions(+), 123 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 795c68c8b5081..4fdd8f27cd652 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -713,73 +713,6 @@ def feature_importances_(self): return self.tree_.compute_feature_importances() - def _get_y_for_leaves(self, X, sample_weight=None): - n_samples = X.shape[0] - - # get the predictions - X_leaves = self.apply(X) - - bootstrap_indices = np.empty(shape, dtype=np.int64) - for i, estimator in enumerate(self.estimators_): - # Get bootstrap indices. - if self.bootstrap: - n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples) - bootstrap_indices[:, i] = _generate_sample_indices( - estimator.random_state, n_samples, n_samples_bootstrap - ) - else: - bootstrap_indices[:, i] = np.arange(n_samples) - - # Get predictions on bootstrap indices. - X_leaves[:, i] = X_leaves[bootstrap_indices[:, i], i] - - if sorter is not None: - # Reassign bootstrap indices to account for target sorting. - bootstrap_indices = np.argsort(sorter)[bootstrap_indices] - - bootstrap_indices += 1 # for sparse matrix (0s as empty) - - # Get the maximum number of nodes (internal + leaves) across trees. - # Get the maximum number of samples per leaf across trees (if needed). 
- max_node_count = 0 - max_samples_leaf = 0 if not leaf_subsample else max_samples_leaf - for i, estimator in enumerate(self.estimators_): - node_count = estimator.tree_.node_count - if node_count > max_node_count: - max_node_count = node_count - if not leaf_subsample: - sample_count = np.max(np.bincount(X_leaves[:, i])) - if sample_count > max_samples_leaf: - max_samples_leaf = sample_count - - # Initialize NumPy array (more efficient serialization than dict/list). - shape = (self.n_estimators, max_node_count, max_samples_leaf) - y_train_leaves = np.zeros(shape, dtype=np.int64) - - for i, estimator in enumerate(self.estimators_): - # Group training indices by leaf node. - leaf_indices, leaf_values_list = _group_by_value(X_leaves[:, i]) - - if leaf_subsample: - random.seed(estimator.random_state) - - # Map each leaf node to its list of training indices. - for leaf_idx, leaf_values in zip(leaf_indices, leaf_values_list): - y_indices = bootstrap_indices[:, i][leaf_values] - - if sample_weight is not None: - y_indices = y_indices[sample_weight[y_indices - 1] > 0] - - # Subsample leaf training indices (without replacement). - if leaf_subsample and max_samples_leaf < len(y_indices): - if not isinstance(y_indices, list): - y_indices = list(y_indices) - y_indices = random.sample(y_indices, max_samples_leaf) - - y_train_leaves[i, leaf_idx, : len(y_indices)] = y_indices - - return y_train_leaves - # ============================================================================= # Public estimators diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 20020b4a5361c..721b475f40436 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -72,10 +72,6 @@ cdef class BaseCriterion: SIZE_t end ) noexcept nogil - # cdef void node_samples( - # self, - # vector[vector[DOUBLE_t]]* dest - # ) noexcept nogil cdef class Criterion(BaseCriterion): """Abstract interface for supervised impurity criteria.""" @@ -94,6 +90,11 @@ cdef class Criterion(BaseCriterion): cdef void init_sum_missing(self) cdef void init_missing(self, SIZE_t n_missing) noexcept nogil + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]* dest + ) noexcept nogil + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index d60cab3063c1b..c3f08ec859bee 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -46,7 +46,7 @@ cdef class BaseCriterion: in current node and in children nodes. This object stores methods on how to calculate how good a split is using - a set API. + a set API. Samples in the "current" node are stored in `samples[start:end]` which is partitioned around `pos` (an index in `start:end`) so that: @@ -186,9 +186,9 @@ cdef class BaseCriterion: ) noexcept nogil: """Abstract method which will set sample pointers in the criterion. - The dataset array that we compute criteria on is assumed to consist of 'N' - ordered samples or rows (i.e. sorted). Since we pass this by reference, we - use sample pointers to move the start and end around to consider only a subset of data. + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. This function should also update relevant statistics that the class uses to compute the final criterion. 
Parameters @@ -252,10 +252,28 @@ cdef class Criterion(BaseCriterion): Number of missing values for specific feature. """ pass - + cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]* dest + ) noexcept nogil: + cdef SIZE_t i, j + + # Resize the destination vector of vectors + dest.resize(self.n_node_samples) + + # Loop over the samples + for i in range(self.n_node_samples): + # Get the index of the current sample + j = self.sample_indices[self.start + i] + + # Get the sample values for each output + for k in range(self.n_outputs): + dest[i][k].push_back(self.y[j, k]) + cdef inline void _move_sums_classification( ClassificationCriterion criterion, double[:, ::1] sum_1, diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index fc49471569ecc..fb21f676e66cc 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -10,6 +10,7 @@ # License: BSD 3 clause # See _splitter.pyx for details. +from libcpp.vector cimport vector from ._criterion cimport BaseCriterion, Criterion @@ -106,6 +107,8 @@ cdef class Splitter(BaseSplitter): const unsigned char[::1] feature_has_missing, ) except -1 + cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil + # Methods that allow modifications to stopping conditions cdef bint check_presplit_conditions( self, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index a58514d093ddf..7f21d5da545fb 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -53,12 +53,12 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil self.n_missing = 0 cdef class BaseSplitter: - """This is an abstract interface for splitters. + """This is an abstract interface for splitters. For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of covariates, labels, or both. Although scikit-learn currently only contains supervised tree methods, this class enables 3rd party packages to leverage - scikit-learn's Cython code for splitting. + scikit-learn's Cython code for splitting. A splitter is usually used in conjunction with a criterion class, which explicitly handles computing the criteria, which we split on. The setting of that criterion class is handled @@ -112,7 +112,7 @@ cdef class BaseSplitter: cdef int pointer_size(self) noexcept nogil: """Size of the pointer for split records. - + Overriding this function allows one to use different subclasses of `SplitRecord`. """ @@ -156,7 +156,6 @@ cdef class Splitter(BaseSplitter): self.min_weight_leaf = min_weight_leaf self.random_state = random_state - def __reduce__(self): return (type(self), (self.criterion, self.max_features, @@ -281,6 +280,10 @@ cdef class Splitter(BaseSplitter): self.criterion.node_value(dest) + cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil: + """Copy the samples[start:end] into dest.""" + self.criterion.node_samples(dest) + cdef double node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" @@ -293,7 +296,7 @@ cdef class Splitter(BaseSplitter): bint missing_go_to_left, ) noexcept nogil: """Check stopping conditions pre-split. - + This is typically a metric that is cheaply computed given the current proposed split, which is stored as a the `current_split` argument. 
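The `node_samples` hook introduced here copies, for the node between `start` and `end`, each sample's `y` row into a C++ vector-of-vectors. In Python terms the per-leaf payload it is meant to populate behaves roughly like the mapping below (a sketch of the intent only, with made-up values; the actual container lives on the Cython tree):

```python
# Rough Python picture of what node_samples() collects per leaf: for every
# leaf node id, the (n_leaf_samples, n_outputs) training targets that ended
# up in that leaf. Values here are made up for illustration.
import numpy as np

leaf_value_samples = {
    3: np.array([[1.2], [1.4], [1.3]]),  # leaf id -> stored y rows
    4: np.array([[5.0], [5.5]]),
}

# With the raw targets retained, per-leaf statistics beyond the mean become
# cheap, e.g. a 90th-percentile prediction for samples routed to leaf 3:
print(np.quantile(leaf_value_samples[3], 0.9, axis=0))
```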
@@ -301,7 +304,7 @@ cdef class Splitter(BaseSplitter): cdef SIZE_t min_samples_leaf = self.min_samples_leaf cdef SIZE_t end_non_missing = self.end - n_missing cdef SIZE_t n_left, n_right - + if missing_go_to_left: n_left = current_split.pos - self.start + n_missing n_right = end_non_missing - current_split.pos @@ -312,14 +315,14 @@ cdef class Splitter(BaseSplitter): # Reject if min_samples_leaf is not guaranteed if n_left < min_samples_leaf or n_right < min_samples_leaf: return 1 - + return 0 cdef bint check_postsplit_conditions( self ) noexcept nogil: """Check stopping conditions after evaluating the split. - + This takes some metric that is stored in the Criterion object and checks against internal stop metrics. """ @@ -329,10 +332,10 @@ cdef class Splitter(BaseSplitter): if ((self.criterion.weighted_n_left < min_weight_leaf) or (self.criterion.weighted_n_right < min_weight_leaf)): return 1 - + return 0 - + cdef inline void shift_missing_values_to_left_if_required( SplitRecord* best, SIZE_t[::1] samples, @@ -360,7 +363,7 @@ cdef inline void shift_missing_values_to_left_if_required( ctypedef fused Partitioner: DensePartitioner SparsePartitioner - + cdef inline int node_split_best( Splitter splitter, Partitioner partitioner, @@ -504,9 +507,9 @@ cdef inline int node_split_best( if p >= end_non_missing: continue - + current_split.pos = p - + # Reject if min_samples_leaf is not guaranteed if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue @@ -740,8 +743,6 @@ cdef inline int node_split_random( cdef SIZE_t n_features = splitter.n_features cdef SIZE_t max_features = splitter.max_features - cdef SIZE_t min_samples_leaf = splitter.min_samples_leaf - cdef double min_weight_leaf = splitter.min_weight_leaf cdef UINT32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index cbe85886cd865..94714cc33400c 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -14,6 +14,7 @@ import numpy as np cimport numpy as cnp from libcpp.vector cimport vector +from libcpp.unordered_map cimport unordered_map ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight @@ -36,6 +37,7 @@ cdef struct Node: DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node unsigned char missing_go_to_left # Whether features have missing values + cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. @@ -45,7 +47,14 @@ cdef class BaseTree: cdef Node* nodes # Array of nodes cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample - cdef double* value # Array of values prediction values for each node + cdef double* value # Array of values prediction values for each node + + # Enables the use of tree to store distributions of the output to allow + # arbitrary usage of the the leaves. This is used in the quantile + # estimators for example. + # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples # Generic Methods: These are generic methods used by any tree. 
cdef int _resize(self, SIZE_t capacity) except -1 nogil @@ -61,7 +70,7 @@ cdef class BaseTree: double weighted_n_node_samples, unsigned char missing_go_to_left ) except -1 nogil - + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) @@ -101,10 +110,10 @@ cdef class Tree(BaseTree): # The Supervised Tree object is a binary tree structure constructed by the # TreeBuilder. The tree structure is used for predictions and # feature importances. - # + # # Value of upstream properties: # - value_stride = n_outputs * max_n_classes - # - value = (capacity, n_outputs, max_n_classes) array of values + # - value = (capacity, n_outputs, max_n_classes) array of values # Input/Output layout for supervised tree cdef public SIZE_t n_features # Number of features in X @@ -137,6 +146,8 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping + cdef unsigned char store_leaf_values # Whether to store leaf values + cpdef build( self, Tree tree, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 2256b28c7df10..8ca98a64b42ab 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -158,15 +158,23 @@ cdef struct StackRecord: cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, double min_weight_leaf, - SIZE_t max_depth, double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + double min_impurity_decrease, + unsigned char store_leaf_values=False + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values cpdef build( self, @@ -221,6 +229,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t max_depth_seen = -1 cdef int rc = 0 + cdef int node_idx + cdef stack[StackRecord] builder_stack cdef StackRecord stack_record @@ -308,6 +318,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "is_left": 1, "impurity": split.impurity_left, "n_constant_features": n_constant_features}) + elif self.store_leaf_values and is_leaf: + with gil: + print('Storing leaf values...') + + # copy leaf values to leaf_values array + splitter.node_samples(&tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -317,7 +333,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen - + # free the memory created for the SplitRecord pointer free(split_ptr) @@ -364,10 +380,17 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """ cdef SIZE_t max_leaf_nodes - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, min_weight_leaf, - SIZE_t max_depth, SIZE_t max_leaf_nodes, - double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + SIZE_t max_leaf_nodes, + double min_impurity_decrease, + unsigned char store_leaf_values=False, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf @@ -375,6 +398,7 
@@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values cpdef build( self, @@ -488,7 +512,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """Adds node w/ partition ``[start, end)`` to the frontier. """ cdef SplitRecord split cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) - + cdef SIZE_t node_id cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 @@ -553,7 +577,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.improvement = 0.0 res.impurity_left = impurity res.impurity_right = impurity - + free(split_ptr) return 0 @@ -564,7 +588,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef class BaseTree: """Base class for Cython tree models. - + Downstream classes must implement """ cdef int _resize( @@ -622,7 +646,7 @@ cdef class BaseTree: Node* node ) except -1 nogil: """Set split node data. - + Parameters ---------- split_node : SplitRecord* @@ -641,7 +665,7 @@ cdef class BaseTree: Node* node ) except -1 nogil: """Set leaf node data. - + Parameters ---------- split_node : SplitRecord* @@ -655,9 +679,12 @@ cdef class BaseTree: node.threshold = _TREE_UNDEFINED return 1 - cdef DTYPE_t _compute_feature(self, const DTYPE_t[:, :] X_ndarray, - SIZE_t sample_index, - Node *node) noexcept nogil: + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil: """Compute feature from a given data matrix, X. In axis-aligned trees, this is simply the value in the column of X @@ -668,7 +695,7 @@ cdef class BaseTree: return feature cdef SIZE_t _add_node( - self, + self, SIZE_t parent, bint is_left, bint is_leaf, @@ -679,7 +706,9 @@ cdef class BaseTree: unsigned char missing_go_to_left ) except -1 nogil: """Add a node to the tree. + The new node registers itself as the child of its parent. + Parameters ---------- parent : SIZE_t @@ -697,7 +726,7 @@ cdef class BaseTree: The number of samples in the node. weighted_n_node_samples : double The weight of the samples in the node. - + Returns (size_t)(-1) on error. """ cdef SIZE_t node_id = self.node_count @@ -719,12 +748,12 @@ cdef class BaseTree: if is_leaf: if self._set_leaf_node(split_node, node) != 1: - with gil: - raise RuntimeError + with gil: + raise RuntimeError else: if self._set_split_node(split_node, node) != 1: - with gil: - raise RuntimeError + with gil: + raise RuntimeError node.missing_go_to_left = missing_go_to_left self.node_count += 1 @@ -796,8 +825,8 @@ cdef class BaseTree: # Extract input cdef const DTYPE_t[:] X_data = X.data - cdef const INT32_t[:] X_indices = X.indices - cdef const INT32_t[:] X_indptr = X.indptr + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] @@ -928,8 +957,8 @@ cdef class BaseTree: # Extract input cdef const DTYPE_t[:] X_data = X.data - cdef const INT32_t[:] X_indices = X.indices - cdef const INT32_t[:] X_indptr = X.indptr + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] @@ -1043,7 +1072,7 @@ cdef class BaseTree: # ... 
and node.right_child != _TREE_LEAF: self._compute_feature_importances( importances, node) - + node += 1 for i in range(self.n_features): @@ -1065,7 +1094,7 @@ cdef class BaseTree: Node* node ) noexcept nogil: """Compute feature importances from a Node in the Tree. - + Wrapped in a private function to allow subclassing that computes feature importances. """ @@ -1321,6 +1350,9 @@ cdef class Tree(BaseTree): self.value = NULL self.nodes = NULL + # initialize the hash map for the value samples + self.value_samples = unordered_map[SIZE_t, vector[vector[DOUBLE_t]]]() + def __dealloc__(self): """Destructor.""" # Free all inner structures From 21ccb30478bdff652118af59a4cd614a23f799d0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 15 Jun 2023 10:35:44 -0400 Subject: [PATCH 14/39] [ENH] Adding leaf node samples to be stored when "quantile" tree is turned on (#45) #### Reference Issues/PRs Addresses the quantile-trees part of: https://github.com/neurodata/scikit-tree/issues/29 #### What does this implement/fix? Explain your changes. 1. Stores for each leaf node a 2D numpy array of the y-samples (remember `y` is (n_samples, n_outputs)) 2. Does this all the way in Criterion 3. Only supports supervised tree/splitter/criterion 4. merges in `main` changes. #### Any other comments? --------- Signed-off-by: Adam Li --- doc/authors_emeritus.rst | 1 + doc/contributor_experience_team.rst | 12 +- doc/modules/classes.rst | 1 + doc/modules/learning_curve.rst | 42 +- doc/visualizations.rst | 1 + doc/whats_new/v1.3.rst | 286 +++++--- .../plot_kernel_ridge_regression.py | 1 + .../model_selection/plot_validation_curve.py | 46 +- sklearn/base.py | 10 +- sklearn/calibration.py | 7 +- sklearn/cluster/_affinity_propagation.py | 4 +- sklearn/cluster/_agglomerative.py | 5 +- sklearn/cluster/_bicluster.py | 4 +- sklearn/cluster/_birch.py | 8 +- sklearn/cluster/_bisect_k_means.py | 4 +- sklearn/cluster/_dbscan.py | 7 +- sklearn/cluster/_feature_agglomeration.py | 34 +- sklearn/cluster/_kmeans.py | 11 +- sklearn/cluster/_mean_shift.py | 3 +- sklearn/cluster/_optics.py | 7 +- sklearn/cluster/_spectral.py | 4 +- .../tests/test_feature_agglomeration.py | 24 + sklearn/compose/_column_transformer.py | 7 +- sklearn/compose/_target.py | 6 +- sklearn/covariance/_elliptic_envelope.py | 3 +- sklearn/covariance/_empirical_covariance.py | 3 +- sklearn/covariance/_graph_lasso.py | 5 +- sklearn/covariance/_robust_covariance.py | 3 +- sklearn/covariance/_shrunk_covariance.py | 8 +- sklearn/cross_decomposition/_pls.py | 7 +- sklearn/datasets/_arff_parser.py | 5 +- sklearn/datasets/tests/test_openml.py | 4 +- sklearn/decomposition/_dict_learning.py | 7 +- sklearn/decomposition/_factor_analysis.py | 4 +- sklearn/decomposition/_fastica.py | 7 +- sklearn/decomposition/_incremental_pca.py | 8 +- sklearn/decomposition/_kernel_pca.py | 4 +- sklearn/decomposition/_lda.py | 7 +- sklearn/decomposition/_nmf.py | 44 +- sklearn/decomposition/_pca.py | 7 +- sklearn/decomposition/_sparse_pca.py | 3 +- sklearn/decomposition/_truncated_svd.py | 4 +- sklearn/decomposition/tests/test_nmf.py | 27 + sklearn/discriminant_analysis.py | 9 +- sklearn/dummy.py | 7 +- sklearn/ensemble/_bagging.py | 8 +- sklearn/ensemble/_forest.py | 174 ++++- sklearn/ensemble/_gb.py | 8 +- .../gradient_boosting.py | 4 +- sklearn/ensemble/_iforest.py | 3 +- sklearn/ensemble/_stacking.py | 8 +- sklearn/ensemble/_voting.py | 11 +- sklearn/ensemble/_weight_boosting.py | 8 +- sklearn/ensemble/tests/test_forest.py | 51 ++ .../feature_extraction/_dict_vectorizer.py | 5 +- 
sklearn/feature_extraction/_hash.py | 4 +- sklearn/feature_extraction/image.py | 3 +- sklearn/feature_extraction/text.py | 14 +- sklearn/feature_selection/_from_model.py | 11 +- sklearn/feature_selection/_rfe.py | 11 +- sklearn/feature_selection/_sequential.py | 7 +- .../_univariate_selection.py | 4 +- .../feature_selection/_variance_threshold.py | 3 +- sklearn/gaussian_process/_gpc.py | 4 +- sklearn/gaussian_process/_gpr.py | 4 +- sklearn/impute/_base.py | 8 +- sklearn/impute/_iterative.py | 8 +- sklearn/impute/_knn.py | 3 +- sklearn/isotonic.py | 3 +- sklearn/kernel_approximation.py | 13 +- sklearn/kernel_ridge.py | 4 +- sklearn/linear_model/_base.py | 5 +- sklearn/linear_model/_bayes.py | 8 +- sklearn/linear_model/_coordinate_descent.py | 11 +- sklearn/linear_model/_glm/glm.py | 4 +- sklearn/linear_model/_huber.py | 3 +- sklearn/linear_model/_least_angle.py | 10 +- sklearn/linear_model/_logistic.py | 9 +- sklearn/linear_model/_omp.py | 7 +- sklearn/linear_model/_passive_aggressive.py | 9 +- sklearn/linear_model/_quantile.py | 3 +- sklearn/linear_model/_ransac.py | 7 +- sklearn/linear_model/_ridge.py | 13 +- sklearn/linear_model/_stochastic_gradient.py | 13 +- sklearn/linear_model/_theil_sen.py | 3 +- sklearn/manifold/_isomap.py | 13 +- sklearn/manifold/_locally_linear.py | 5 +- sklearn/manifold/_mds.py | 4 +- sklearn/manifold/_spectral_embedding.py | 4 +- sklearn/manifold/_t_sne.py | 11 +- sklearn/metrics/pairwise.py | 14 +- sklearn/mixture/_base.py | 4 +- sklearn/model_selection/__init__.py | 2 + sklearn/model_selection/_plot.py | 680 +++++++++++++++--- sklearn/model_selection/_search.py | 6 +- .../_search_successive_halving.py | 6 +- sklearn/model_selection/tests/test_plot.py | 337 +++++++-- sklearn/multiclass.py | 29 +- sklearn/multioutput.py | 26 +- sklearn/naive_bayes.py | 12 +- sklearn/neighbors/_classification.py | 12 +- sklearn/neighbors/_graph.py | 11 +- sklearn/neighbors/_kde.py | 7 +- sklearn/neighbors/_lof.py | 7 +- sklearn/neighbors/_nca.py | 4 +- sklearn/neighbors/_nearest_centroid.py | 4 +- sklearn/neighbors/_regression.py | 12 +- sklearn/neighbors/_unsupervised.py | 6 +- .../neural_network/_multilayer_perceptron.py | 12 +- sklearn/neural_network/_rbm.py | 9 +- sklearn/pipeline.py | 16 +- sklearn/preprocessing/_data.py | 57 +- sklearn/preprocessing/_discretization.py | 3 +- sklearn/preprocessing/_encoders.py | 7 +- .../preprocessing/_function_transformer.py | 3 +- sklearn/preprocessing/_label.py | 10 +- sklearn/preprocessing/_polynomial.py | 6 +- sklearn/preprocessing/_target_encoder.py | 5 +- sklearn/preprocessing/tests/test_data.py | 19 + sklearn/random_projection.py | 4 +- sklearn/semi_supervised/_label_propagation.py | 3 +- sklearn/semi_supervised/_self_training.py | 7 +- sklearn/svm/_base.py | 4 +- sklearn/svm/_classes.py | 7 +- sklearn/tests/test_metadata_routing.py | 15 + sklearn/tests/test_public_functions.py | 1 + sklearn/tree/_classes.py | 185 ++++- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 15 +- sklearn/tree/_splitter.pxd | 4 +- sklearn/tree/_splitter.pyx | 46 +- sklearn/tree/_tree.pxd | 20 +- sklearn/tree/_tree.pyx | 58 +- sklearn/tree/tests/test_tree.py | 175 ++++- sklearn/utils/_metadata_requests.py | 12 +- sklearn/utils/_plotting.py | 40 ++ sklearn/utils/estimator_checks.py | 19 +- sklearn/utils/tests/test_param_validation.py | 4 +- sklearn/utils/tests/test_plotting.py | 63 ++ sklearn/utils/tests/test_validation.py | 10 + sklearn/utils/validation.py | 51 +- 141 files changed, 2511 insertions(+), 797 deletions(-) create mode 100644 
sklearn/utils/tests/test_plotting.py diff --git a/doc/authors_emeritus.rst b/doc/authors_emeritus.rst index b979b77bba974..a56e2bc408ff4 100644 --- a/doc/authors_emeritus.rst +++ b/doc/authors_emeritus.rst @@ -20,6 +20,7 @@ - Wei Li - Paolo Losi - Gilles Louppe +- Chiara Marmo - Vincent Michel - Jarrod Millman - Alexandre Passos diff --git a/doc/contributor_experience_team.rst b/doc/contributor_experience_team.rst index 2e09d9069849a..00b658632302e 100644 --- a/doc/contributor_experience_team.rst +++ b/doc/contributor_experience_team.rst @@ -18,6 +18,10 @@
   Lucy Liu
+  Maxwell Liu
   Juan Martin Loyola
@@ -26,14 +30,6 @@
   Sylvain Marié
-  Chiara Marmo
-  Maxwell Liu
   Norbert Preining

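To make the motivation for the leaf-storage plumbing in this patch concrete: without it, per-leaf target distributions have to be rebuilt by hand after fitting, as in the sketch below. It uses only the standard scikit-learn API (no new keywords); the patch aims to keep this bookkeeping inside the tree instead.

```python
# What quantile-style prediction currently requires without leaf storage:
# regroup the training targets by the leaf each sample falls into.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
y = X[:, 0] + 0.1 * rng.normal(size=200)

reg = DecisionTreeRegressor(min_samples_leaf=20, random_state=0).fit(X, y)

# Manual pass over the training data: leaf id -> targets in that leaf.
train_leaves = reg.apply(X)
leaf_to_targets = {leaf: y[train_leaves == leaf] for leaf in np.unique(train_leaves)}

# Per-leaf 90th percentile for a few (training) samples.
new_leaves = reg.apply(X[:5])
q90 = np.array([np.quantile(leaf_to_targets[leaf], 0.9) for leaf in new_leaves])
print(q90)
```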
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 4961fb0fec366..204c300b1a9b8 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1247,6 +1247,7 @@ Visualization :template: display_only_from_estimator.rst model_selection.LearningCurveDisplay + model_selection.ValidationCurveDisplay .. _multiclass_ref: diff --git a/doc/modules/learning_curve.rst b/doc/modules/learning_curve.rst index 0ce64063d4cd9..3d458a1a67416 100644 --- a/doc/modules/learning_curve.rst +++ b/doc/modules/learning_curve.rst @@ -71,7 +71,7 @@ The function :func:`validation_curve` can help in this case:: >>> import numpy as np >>> from sklearn.model_selection import validation_curve >>> from sklearn.datasets import load_iris - >>> from sklearn.linear_model import Ridge + >>> from sklearn.svm import SVC >>> np.random.seed(0) >>> X, y = load_iris(return_X_y=True) @@ -80,30 +80,50 @@ The function :func:`validation_curve` can help in this case:: >>> X, y = X[indices], y[indices] >>> train_scores, valid_scores = validation_curve( - ... Ridge(), X, y, param_name="alpha", param_range=np.logspace(-7, 3, 3), - ... cv=5) + ... SVC(kernel="linear"), X, y, param_name="C", param_range=np.logspace(-7, 3, 3), + ... ) >>> train_scores - array([[0.93..., 0.94..., 0.92..., 0.91..., 0.92...], - [0.93..., 0.94..., 0.92..., 0.91..., 0.92...], - [0.51..., 0.52..., 0.49..., 0.47..., 0.49...]]) + array([[0.90..., 0.94..., 0.91..., 0.89..., 0.92...], + [0.9... , 0.92..., 0.93..., 0.92..., 0.93...], + [0.97..., 1... , 0.98..., 0.97..., 0.99...]]) >>> valid_scores - array([[0.90..., 0.84..., 0.94..., 0.96..., 0.93...], - [0.90..., 0.84..., 0.94..., 0.96..., 0.93...], - [0.46..., 0.25..., 0.50..., 0.49..., 0.52...]]) + array([[0.9..., 0.9... , 0.9... , 0.96..., 0.9... ], + [0.9..., 0.83..., 0.96..., 0.96..., 0.93...], + [1.... , 0.93..., 1.... , 1.... , 0.9... ]]) + +If you intend to plot the validation curves only, the class +:class:`~sklearn.model_selection.ValidationCurveDisplay` is more direct than +using matplotlib manually on the results of a call to :func:`validation_curve`. +You can use the method +:meth:`~sklearn.model_selection.ValidationCurveDisplay.from_estimator` similarly +to :func:`validation_curve` to generate and plot the validation curve: + +.. plot:: + :context: close-figs + :align: center + + from sklearn.datasets import load_iris + from sklearn.model_selection import ValidationCurveDisplay + from sklearn.svm import SVC + from sklearn.utils import shuffle + X, y = load_iris(return_X_y=True) + X, y = shuffle(X, y, random_state=0) + ValidationCurveDisplay.from_estimator( + SVC(kernel="linear"), X, y, param_name="C", param_range=np.logspace(-7, 3, 10) + ) If the training score and the validation score are both low, the estimator will be underfitting. If the training score is high and the validation score is low, the estimator is overfitting and otherwise it is working very well. A low training score and a high validation score is usually not possible. Underfitting, overfitting, and a working model are shown in the in the plot below where we vary -the parameter :math:`\gamma` of an SVM on the digits dataset. +the parameter `gamma` of an SVM with an RBF kernel on the digits dataset. .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_validation_curve_001.png :target: ../auto_examples/model_selection/plot_validation_curve.html :align: center :scale: 50% - .. 
_learning_curve: Learning curve diff --git a/doc/visualizations.rst b/doc/visualizations.rst index f692fd8efd1df..9a44f6feb1b48 100644 --- a/doc/visualizations.rst +++ b/doc/visualizations.rst @@ -89,3 +89,4 @@ Display Objects metrics.PredictionErrorDisplay metrics.RocCurveDisplay model_selection.LearningCurveDisplay + model_selection.ValidationCurveDisplay diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index bb35a1db224b4..41c03293cf067 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -29,11 +29,6 @@ random sampling procedures. `transform_algorithm` is not the same as `fit_algorithm` and the number of iterations is small. :pr:`24871` by :user:`Omar Salman `. -- |Fix| Treat more consistently small values in the `W` and `H` matrices during the - `fit` and `transform` steps of :class:`decomposition.NMF` and - :class:`decomposition.MiniBatchNMF` which can produce different results than previous - versions. :pr:`25438` by :user:`Yotam Avidar-Constantini `. - - |Enhancement| The `sample_weight` parameter now will be used in centroids initialization for :class:`cluster.KMeans`, :class:`cluster.BisectingKMeans` and :class:`cluster.MiniBatchKMeans`. @@ -43,6 +38,11 @@ random sampling procedures. :user:`Jérémie du Boisberranger `, :user:`Guillaume Lemaitre `. +- |Fix| Treat more consistently small values in the `W` and `H` matrices during the + `fit` and `transform` steps of :class:`decomposition.NMF` and + :class:`decomposition.MiniBatchNMF` which can produce different results than previous + versions. :pr:`25438` by :user:`Yotam Avidar-Constantini `. + - |Fix| :class:`decomposition.KernelPCA` may produce different results through `inverse_transform` if `gamma` is `None`. Now it will be chosen correctly as `1/n_features` of the data that it is fitted on, while previously it might be @@ -51,6 +51,14 @@ random sampling procedures. used each time the kernel is called. :pr:`26337` by :user:`Yao Xiao `. +Changed displays +---------------- + +- |Enhancement| :class:`model_selection.LearningCurveDisplay` displays both the + train and test curves by default. You can set `score_type="test"` to keep the + past behaviour. + :pr:`25120` by :user:`Guillaume Lemaitre `. + Changes impacting all modules ----------------------------- @@ -201,23 +209,9 @@ Changelog :mod:`sklearn.cluster` ...................... -- |API| The `sample_weight` parameter in `predict` for - :meth:`cluster.KMeans.predict` and :meth:`cluster.MiniBatchKMeans.predict` - is now deprecated and will be removed in v1.5. - :pr:`25251` by :user:`Gleb Levitski `. - -- |Enhancement| The `sample_weight` parameter now will be used in centroids - initialization for :class:`cluster.KMeans`, :class:`cluster.BisectingKMeans` - and :class:`cluster.MiniBatchKMeans`. - This change will break backward compatibility, since numbers generated - from same random seeds will be different. - :pr:`25752` by :user:`Gleb Levitski `, - :user:`Jérémie du Boisberranger `, - :user:`Guillaume Lemaitre `. - - |MajorFeature| Added :class:`cluster.HDBSCAN`, a modern hierarchical density-based clustering algorithm. Similarly to :class:`cluster.OPTICS`, it can be seen as a - generalization of :class:`DBSCAN` by allowing for hierarchical instead of flat + generalization of :class:`cluster.DBSCAN` by allowing for hierarchical instead of flat clustering, however it varies in its approach from :class:`cluster.OPTICS`. 
This algorithm is very robust with respect to its hyperparameters' values and can be used on a wide variety of data without much, if any, tuning. @@ -228,12 +222,30 @@ Changelog :pr:`26385` by :user:`Meekail Zain ` +- |Enhancement| The `sample_weight` parameter now will be used in centroids + initialization for :class:`cluster.KMeans`, :class:`cluster.BisectingKMeans` + and :class:`cluster.MiniBatchKMeans`. + This change will break backward compatibility, since numbers generated + from same random seeds will be different. + :pr:`25752` by :user:`Gleb Levitski `, + :user:`Jérémie du Boisberranger `, + :user:`Guillaume Lemaitre `. + +- |API| The `sample_weight` parameter in `predict` for + :meth:`cluster.KMeans.predict` and :meth:`cluster.MiniBatchKMeans.predict` + is now deprecated and will be removed in v1.5. + :pr:`25251` by :user:`Gleb Levitski `. + +- |API| The `Xred` argument in :func:`cluster.FeatureAgglomeration.inverse_transform` + is renamed to `Xt` and will be removed in v1.5. :pr:`26503` by `Adrin Jalali`_. + :mod:`sklearn.compose` ...................... -- |Fix| `compose.ColumnTransformer` raises an informative error when the individual transformers of `ColumnTransformer` - output pandas dataframes with indexes that are not consistent with each other and the output is configured - to be pandas. :pr:`26286` by `Thomas Fan`_. +- |Fix| `compose.ColumnTransformer` raises an informative error when the individual + transformers of `ColumnTransformer` output pandas dataframes with indexes that are + not consistent with each other and the output is configured to be pandas. + :pr:`26286` by `Thomas Fan`_. - |Fix| :class:`compose.ColumnTransformer` correctly sets the output of the remainder when `set_output` is called. :pr:`26323` by `Thomas Fan`_. @@ -241,6 +253,14 @@ Changelog :mod:`sklearn.covariance` ......................... +- |Fix| Allows `alpha=0` in :class:`covariance.GraphicalLasso` to be + consistent with :func:`covariance.graphical_lasso`. + :pr:`26033` by :user:`Genesis Valencia `. + +- |Fix| :func:`covariance.empirical_covariance` now gives an informative + error message when input is not appropriate. + :pr:`26108` by :user:`Quentin Barthélemy `. + - |API| Deprecates `cov_init` in :func:`covariance.graphical_lasso` in 1.3 since the parameter has no effect. It will be removed in 1.5. :pr:`26033` by :user:`Genesis Valencia `. @@ -256,20 +276,13 @@ Changelog :func:`covariance.graphical_lasso_path`, and :class:`covariance.GraphicalLassoCV`. :pr:`26033` by :user:`Genesis Valencia `. -- |Fix| Allows `alpha=0` in :class:`covariance.GraphicalLasso` to be - consistent with :func:`covariance.graphical_lasso`. - :pr:`26033` by :user:`Genesis Valencia `. - -- |Fix| :func:`covariance.empirical_covariance` now gives an informative - error message when input is not appropriate. - :pr:`26108` by :user:`Quentin Barthélemy `. - :mod:`sklearn.datasets` ....................... -- |API| The `data_transposed` argument of :func:`datasets.make_sparse_coded_signal` - is deprecated and will be removed in v1.5. - :pr:`25784` by :user:`Jérémie du Boisberranger`. +- |Enhancement| Allows to overwrite the parameters used to open the ARFF file using + the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the + pandas parser. + :pr:`26433` by :user:`Guillaume Lemaitre `. - |Fix| :func:`datasets.fetch_openml` returns improved data types when `as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_. @@ -279,28 +292,35 @@ Changelog the pandas parser. 
The parameter `read_csv_kwargs` allows to overwrite this behaviour. :pr:`26551` by :user:`Guillaume Lemaitre `. -- |Enhancement| Allows to overwrite the parameters used to open the ARFF file using - the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the - pandas parser. - :pr:`26433` by :user:`Guillaume Lemaitre `. +- |Fix| :func:`dataasets.fetch_openml` will consistenly use `np.nan` as missing marker + with both parsers `"pandas"` and `"liac-arff"`. + :pr:`26579` by :user:`Guillaume Lemaitre `. + +- |API| The `data_transposed` argument of :func:`datasets.make_sparse_coded_signal` + is deprecated and will be removed in v1.5. + :pr:`25784` by :user:`Jérémie du Boisberranger`. :mod:`sklearn.decomposition` ............................ -- |Enhancement| :class:`decomposition.DictionaryLearning` now accepts the parameter - `callback` for consistency with the function :func:`decomposition.dict_learning`. - :pr:`24871` by :user:`Omar Salman `. - - |Efficiency| :class:`decomposition.MiniBatchDictionaryLearning` and :class:`decomposition.MiniBatchSparsePCA` are now faster for small batch sizes by avoiding duplicate validations. :pr:`25490` by :user:`Jérémie du Boisberranger `. +- |Enhancement| :class:`decomposition.DictionaryLearning` now accepts the parameter + `callback` for consistency with the function :func:`decomposition.dict_learning`. + :pr:`24871` by :user:`Omar Salman `. + - |Fix| Treat more consistently small values in the `W` and `H` matrices during the `fit` and `transform` steps of :class:`decomposition.NMF` and :class:`decomposition.MiniBatchNMF` which can produce different results than previous versions. :pr:`25438` by :user:`Yotam Avidar-Constantini `. +- |API| The `W` argument in :func:`decomposition.NMF.inverse_transform` and + :class:`decomposition.MiniBatchNMF.inverse_transform` is renamed to `Xt` and + will be removed in v1.5. :pr:`26503` by `Adrin Jalali`_. + :mod:`sklearn.discriminant_analysis` .................................... @@ -364,6 +384,7 @@ Changelog :mod:`sklearn.exception` ........................ + - |Feature| Added :class:`exception.InconsistentVersionWarning` which is raised when a scikit-learn estimator is unpickled with a scikit-learn version that is inconsistent with the sckit-learn version the estimator was pickled with. @@ -393,6 +414,9 @@ Changelog - |Enhancement| Added the parameter `fill_value` to :class:`impute.IterativeImputer`. :pr:`25232` by :user:`Thijs van Weezel `. +- |Fix| :class:`impute.IterativeImputer` now correctly preserves the Pandas + Index when the `set_config(transform_output="pandas")`. :pr:`26454` by `Thomas Fan`_. + :mod:`sklearn.inspection` ......................... @@ -420,12 +444,6 @@ Changelog now preserve dtype for `numpy.float32`. :pr:`25587` by :user:`Omar Salman `. -- |API| Deprecates `n_iter` in favor of `max_iter` in - :class:`linear_model.BayesianRidge` and :class:`linear_model.ARDRegression`. - `n_iter` will be removed in scikit-learn 1.5. This change makes those - estimators consistent with the rest of estimators. - :pr:`25697` by :user:`John Pangas `. - - |Enhancement| The `n_iter_` attribute has been included in :class:`linear_model.ARDRegression` to expose the actual number of iterations required to reach the stopping criterion. @@ -436,36 +454,41 @@ Changelog on linearly separable problems. :pr:`25214` by `Tom Dupre la Tour`_. +- |API| Deprecates `n_iter` in favor of `max_iter` in + :class:`linear_model.BayesianRidge` and :class:`linear_model.ARDRegression`. 
+ `n_iter` will be removed in scikit-learn 1.5. This change makes those + estimators consistent with the rest of estimators. + :pr:`25697` by :user:`John Pangas `. + +:mod:`sklearn.manifold` +....................... + +- |Fix| :class:`manifold.Isomap` now correctly preserves the Pandas + Index when the `set_config(transform_output="pandas")`. :pr:`26454` by `Thomas Fan`_. + :mod:`sklearn.metrics` ...................... -- |Efficiency| The computation of the expected mutual information in - :func:`metrics.adjusted_mutual_info_score` is now faster when the number of - unique labels is large and its memory usage is reduced in general. - :pr:`25713` by :user:`Kshitij Mathur `, - :user:`Guillaume Lemaitre `, :user:`Omar Salman ` and - :user:`Jérémie du Boisberranger `. - - |Feature| Adds `zero_division=np.nan` to multiple classification metrics: - :func:`precision_score`, :func:`recall_score`, :func:`f1_score`, - :func:`fbeta_score`, :func:`precision_recall_fscore_support`, - :func:`classification_report`. When `zero_division=np.nan` and there is a + :func:`metrics.precision_score`, :func:`metrics.recall_score`, + :func:`metrics.f1_score`, :func:`metrics.fbeta_score`, + :func:`metrics.precision_recall_fscore_support`, + :func:`metrics.classification_report`. When `zero_division=np.nan` and there is a zero division, the metric is undefined and is excluded from averaging. When not used for averages, the value returned is `np.nan`. :pr:`25531` by :user:`Marc Torrellas Socastro `. -- |Fix| :func:`metric.manhattan_distances` now supports readonly sparse datasets. - :pr:`25432` by :user:`Julien Jerphanion `. - -- |Fix| Fixed :func:`classification_report` so that empty input will return - `np.nan`. Previously, "macro avg" and `weighted avg` would return - e.g. `f1-score=np.nan` and `f1-score=0.0`, being inconsistent. Now, they - both return `np.nan`. - :pr:`25531` by :user:`Marc Torrellas Socastro `. +- |Feature| :func:`metrics.average_precision_score` now supports the + multiclass case. + :pr:`17388` by :user:`Geoffrey Bolmier ` and + :pr:`24769` by :user:`Ashwin Mathur `. -- |Fix| :func:`metric.ndcg_score` now gives a meaningful error message for input of - length 1. - :pr:`25672` by :user:`Lene Preuss ` and :user:`Wei-Chun Chu `. +- |Efficiency| The computation of the expected mutual information in + :func:`metrics.adjusted_mutual_info_score` is now faster when the number of + unique labels is large and its memory usage is reduced in general. + :pr:`25713` by :user:`Kshitij Mathur `, + :user:`Guillaume Lemaitre `, :user:`Omar Salman ` and + :user:`Jérémie du Boisberranger `. - |Enhancement| :class:`metrics.silhouette_samples` nows accepts a sparse matrix of pairwise distances between samples, or a feature array. @@ -492,17 +515,23 @@ Changelog chance level. This line is exposed in the `chance_level_` attribute. :pr:`26019` by :user:`Yao Xiao `. -- |Fix| :func:`log_loss` raises a warning if the values of the parameter `y_pred` are - not normalized, instead of actually normalizing them in the metric. Starting from - 1.5 this will raise an error. :pr:`25299` by :user:`Omar Salman `. + +- |Fix| Fixed :func:`metrics.classification_report` so that empty input will return + `np.nan`. Previously, "macro avg" and `weighted avg` would return + e.g. `f1-score=np.nan` and `f1-score=0.0`, being inconsistent. Now, they + both return `np.nan`. + :pr:`25531` by :user:`Marc Torrellas Socastro `. -- |API| The `eps` parameter of the :func:`log_loss` has been deprecated and will be - removed in 1.5. 
:pr:`25299` by :user:`Omar Salman `. +- |Fix| :func:`metrics.ndcg_score` now gives a meaningful error message for input of + length 1. + :pr:`25672` by :user:`Lene Preuss ` and :user:`Wei-Chun Chu `. -- |Feature| :func:`metrics.average_precision_score` now supports the - multiclass case. - :pr:`17388` by :user:`Geoffrey Bolmier ` and - :pr:`24769` by :user:`Ashwin Mathur `. +- |Fix| :func:`metrics.log_loss` raises a warning if the values of the parameter + `y_pred` are not normalized, instead of actually normalizing them in the metric. + Starting from 1.5 this will raise an error. + :pr:`25299` by :user:`Omar Salman ` +- |API| The `eps` parameter of the :func:`metrics.log_loss` has been deprecated and + will be removed in 1.5. :pr:`25299` by :user:`Omar Salman `. + :mod:`sklearn.gaussian_process` ............................... @@ -524,6 +556,18 @@ Changelog :mod:`sklearn.model_selection` .............................. +- |MajorFeature| Added the class :class:`model_selection.ValidationCurveDisplay` + that allows easy plotting of validation curves obtained by the function + :func:`model_selection.validation_curve`. + :pr:`25120` by :user:`Guillaume Lemaitre `. + +- |API| The parameter `log_scale` in the class + :class:`model_selection.LearningCurveDisplay` has been deprecated in 1.3 and + will be removed in 1.5. The default scale can be overriden by setting it + directly on the `ax` object and will be set automatically from the spacing + of the data points otherwise. + :pr:`25120` by :user:`Guillaume Lemaitre `. + - |Enhancement| :func:`model_selection.cross_validate` accepts a new parameter `return_indices` to return the train-test indices of each cv split. :pr:`25659` by :user:`Guillaume Lemaitre `. @@ -546,15 +590,15 @@ Changelog :mod:`sklearn.neighbors` ........................ -- |Fix| Remove support for `KulsinskiDistance` in :class:`neighbors.BallTree`. This - dissimilarity is not a metric and cannot be supported by the BallTree. - :pr:`25417` by :user:`Guillaume Lemaitre `. - - |Enhancement| The performance of :meth:`neighbors.KNeighborsClassifier.predict` and of :meth:`neighbors.KNeighborsClassifier.predict_proba` has been improved when `n_neighbors` is large and `algorithm="brute"` with non Euclidean metrics. :pr:`24076` by :user:`Meekail Zain `, :user:`Julien Jerphanion `. +- |Fix| Remove support for `KulsinskiDistance` in :class:`neighbors.BallTree`. This + dissimilarity is not a metric and cannot be supported by the BallTree. + :pr:`25417` by :user:`Guillaume Lemaitre `. + - |API| The support for metrics other than `euclidean` and `manhattan` and for callables in :class:`neighbors.NearestNeighbors` is deprecated and will be removed in version 1.5. :pr:`24083` by :user:`Valentin Laurent `. @@ -592,10 +636,24 @@ Changelog categorical encoding based on target mean conditioned on the value of the category. :pr:`25334` by `Thomas Fan`_. +- |Feature| :class:`preprocessing.OrdinalEncoder` now supports grouping + infrequent categories into a single feature. Grouping infrequent categories + is enabled by specifying how to select infrequent categories with + `min_frequency` or `max_categories`. :pr:`25677` by `Thomas Fan`_. + +- |Enhancement| :class:`preprocessing.PolynomialFeatures` now calculates the + number of expanded terms a-priori when dealing with sparse `csr` matrices + in order to optimize the choice of `dtype` for `indices` and `indptr`. 
It + can now output `csr` matrices with `np.int32` `indices/indptr` components + when there are few enough elements, and will automatically use `np.int64` + for sufficiently large matrices. + :pr:`20524` by :user:`niuk-a ` and + :pr:`23731` by :user:`Meekail Zain ` + - |Enhancement| A new parameter `sparse_output` was added to - :class:`SplineTransformer`, available as of SciPy 1.8. If `sparse_output=True`, - :class:`SplineTransformer` returns a sparse CSR matrix. - :pr:`24145` by :user:`Christian Lorentzen `. + :class:`preprocessing.SplineTransformer`, available as of SciPy 1.8. If + `sparse_output=True`, :class:`preprocessing.SplineTransformer` returns a sparse + CSR matrix. :pr:`24145` by :user:`Christian Lorentzen `. - |Enhancement| Adds a `feature_name_combiner` parameter to :class:`preprocessing.OneHotEncoder`. This specifies a custom callable to create @@ -610,28 +668,35 @@ Changelog :pr:`24935` by :user:`Seladus `, :user:`Guillaume Lemaitre `, and :user:`Dea María Léon `, :pr:`25257` by :user:`Gleb Levitski `. -- |Feature| :class:`preprocessing.OrdinalEncoder` now supports grouping - infrequent categories into a single feature. Grouping infrequent categories - is enabled by specifying how to select infrequent categories with - `min_frequency` or `max_categories`. :pr:`25677` by `Thomas Fan`_. - - |Enhancement| Subsampling through the `subsample` parameter can now be used in :class:`preprocessing.KBinsDiscretizer` regardless of the strategy used. :pr:`26424` by :user:`Jérémie du Boisberranger `. -- |API| The default value of the `subsample` parameter of - :class:`preprocessing.KBinsDiscretizer` will change from `None` to `200_000` in - version 1.5 when `strategy="kmeans"` or `strategy="uniform"`. - :pr:`26424` by :user:`Jérémie du Boisberranger `. +- |Fix| :class:`preprocessing.AdditiveChi2Sampler` is now stateless. + The `sample_interval_` attribute is deprecated and will be removed in 1.5. + :pr:`25190` by :user:`Vincent Maladière `. - |Fix| :class:`AdditiveChi2Sampler` is now stateless. The `sample_interval_` attribute is deprecated and will be removed in 1.5. :pr:`25190` by :user:`Vincent Maladière `. +- |Fix| :class:`preprocessing.PowerTransformer` now correctly preserves the Pandas + Index when the `set_config(transform_output="pandas")`. :pr:`26454` by `Thomas Fan`_. + - |Fix| :class:`preprocessing.PowerTransformer` now correcly raises error when using `method="box-cox"` on data with a constant `np.nan` column. :pr:`26400` by :user:`Yao Xiao `. +- |Fix| :class:`preprocessing.PowerTransformer` with `method="yeo-johnson"` now leaves + constant features unchanged instead of transforming with an arbitrary value for + the `lambdas_` fitted parameter. + :pr:`26566` by :user:`Jérémie du Boisberranger `. + +- |API| The default value of the `subsample` parameter of + :class:`preprocessing.KBinsDiscretizer` will change from `None` to `200_000` in + version 1.5 when `strategy="kmeans"` or `strategy="uniform"`. + :pr:`26424` by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.svm` .................. @@ -660,45 +725,36 @@ Changelog :mod:`sklearn.utils` .................... -- |API| :func:`estimator_checks.check_transformers_unfitted_stateless` has been +- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas + extension arrays. :pr:`25813` and :pr:`26106` by `Thomas Fan`_. + +- |Fix| :func:`utils.validation.check_array` now supports pandas DataFrames with + extension arrays and object dtypes by return an ndarray with object dtype. + :pr:`25814` by `Thomas Fan`_. 
+ +- |API| :func:`utils.estimator_checks.check_transformers_unfitted_stateless` has been introduced to ensure stateless transformers don't raise `NotFittedError` during `transform` with no prior call to `fit` or `fit_transform`. :pr:`25190` by :user:`Vincent Maladière `. -- |Enhancement| :class:`preprocessing.PolynomialFeatures` now calculates the - number of expanded terms a-priori when dealing with sparse `csr` matrices - in order to optimize the choice of `dtype` for `indices` and `indptr`. It - can now output `csr` matrices with `np.int32` `indices/indptr` components - when there are few enough elements, and will automatically use `np.int64` - for sufficiently large matrices. - :pr:`20524` by :user:`niuk-a ` and - :pr:`23731` by :user:`Meekail Zain ` - - |API| A `FutureWarning` is now raised when instantiating a class which inherits from a deprecated base class (i.e. decorated by :class:`utils.deprecated`) and which overrides the `__init__` method. :pr:`25733` by :user:`Brigitta Sipőcz ` and :user:`Jérémie du Boisberranger `. -- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas - extension arrays. :pr:`25813` and :pr:`26106` by `Thomas Fan`_. - -- |Fix| :func:`utils.validation.check_array` now supports pandas DataFrames with - extension arrays and object dtypes by return an ndarray with object dtype. - :pr:`25814` by `Thomas Fan`_. - :mod:`sklearn.semi_supervised` .............................. -- |Enhancement| :meth:`LabelSpreading.fit` and :meth:`LabelPropagation.fit` now - accepts sparse metrics. +- |Enhancement| :meth:`semi_supervised.LabelSpreading.fit` and + :meth:`semi_supervised.LabelPropagation.fit` now accepts sparse metrics. :pr:`19664` by :user:`Kaushik Amar Das `. Miscellaneous ............. -- |Enhancement| Replace obsolete exceptions EnvironmentError, IOError and - WindowsError. +- |Enhancement| Replace obsolete exceptions `EnvironmentError`, `IOError` and + `WindowsError`. :pr:`26466` by :user:`Dimitri Papadopoulos ORfanos `. 
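Since the changelog entries above only name the new display-based plotting API, a minimal usage sketch follows (assuming a scikit-learn build that already ships `ValidationCurveDisplay`, i.e. 1.3 or this branch):

```python
# Minimal use of the displays referenced in the changelog entries above.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import LearningCurveDisplay, ValidationCurveDisplay
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)

# Wraps validation_curve() and plots train/test scores against the parameter.
ValidationCurveDisplay.from_estimator(
    SVC(), X, y, param_name="gamma", param_range=np.logspace(-6, -1, 5)
)

# LearningCurveDisplay now shows both curves by default; score_type="test"
# keeps the previous behaviour.
LearningCurveDisplay.from_estimator(SVC(), X, y, score_type="test")
plt.show()
```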
Code and Documentation Contributors diff --git a/examples/miscellaneous/plot_kernel_ridge_regression.py b/examples/miscellaneous/plot_kernel_ridge_regression.py index 20b8496ab18aa..fa7cb15446473 100644 --- a/examples/miscellaneous/plot_kernel_ridge_regression.py +++ b/examples/miscellaneous/plot_kernel_ridge_regression.py @@ -203,6 +203,7 @@ "scoring": "neg_mean_squared_error", "negate_score": True, "score_name": "Mean Squared Error", + "score_type": "test", "std_display_style": None, "ax": ax, } diff --git a/examples/model_selection/plot_validation_curve.py b/examples/model_selection/plot_validation_curve.py index 1b3c562594188..48aa19dfbc556 100644 --- a/examples/model_selection/plot_validation_curve.py +++ b/examples/model_selection/plot_validation_curve.py @@ -18,53 +18,23 @@ from sklearn.datasets import load_digits from sklearn.svm import SVC -from sklearn.model_selection import validation_curve +from sklearn.model_selection import ValidationCurveDisplay X, y = load_digits(return_X_y=True) subset_mask = np.isin(y, [1, 2]) # binary classification: 1 vs 2 X, y = X[subset_mask], y[subset_mask] -param_range = np.logspace(-6, -1, 5) -train_scores, test_scores = validation_curve( +disp = ValidationCurveDisplay.from_estimator( SVC(), X, y, param_name="gamma", - param_range=param_range, - scoring="accuracy", + param_range=np.logspace(-6, -1, 5), + score_type="both", n_jobs=2, + score_name="Accuracy", ) -train_scores_mean = np.mean(train_scores, axis=1) -train_scores_std = np.std(train_scores, axis=1) -test_scores_mean = np.mean(test_scores, axis=1) -test_scores_std = np.std(test_scores, axis=1) - -plt.title("Validation Curve with SVM") -plt.xlabel(r"$\gamma$") -plt.ylabel("Score") -plt.ylim(0.0, 1.1) -lw = 2 -plt.semilogx( - param_range, train_scores_mean, label="Training score", color="darkorange", lw=lw -) -plt.fill_between( - param_range, - train_scores_mean - train_scores_std, - train_scores_mean + train_scores_std, - alpha=0.2, - color="darkorange", - lw=lw, -) -plt.semilogx( - param_range, test_scores_mean, label="Cross-validation score", color="navy", lw=lw -) -plt.fill_between( - param_range, - test_scores_mean - test_scores_std, - test_scores_mean + test_scores_std, - alpha=0.2, - color="navy", - lw=lw, -) -plt.legend(loc="best") +disp.ax_.set_title("Validation Curve for SVM with an RBF kernel") +disp.ax_.set_xlabel(r"gamma (inverse radius of the RBF kernel)") +disp.ax_.set_ylim(0.0, 1.1) plt.show() diff --git a/sklearn/base.py b/sklearn/base.py index 5cced34d4b8f0..13bbcab96aa61 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -27,7 +27,7 @@ from .utils.validation import _num_features from .utils.validation import _check_feature_names_in from .utils.validation import _generate_get_feature_names_out -from .utils.validation import check_is_fitted +from .utils.validation import _is_fitted, check_is_fitted from .utils._metadata_requests import _MetadataRequester from .utils.validation import _get_feature_names from .utils._estimator_html_repr import estimator_html_repr @@ -1131,7 +1131,13 @@ def decorator(fit_method): @functools.wraps(fit_method) def wrapper(estimator, *args, **kwargs): global_skip_validation = get_config()["skip_parameter_validation"] - if not global_skip_validation: + + # we don't want to validate again for each call to partial_fit + partial_fit_and_fitted = ( + fit_method.__name__ == "partial_fit" and _is_fitted(estimator) + ) + + if not global_skip_validation and not partial_fit_and_fitted: estimator._validate_params() with config_context( diff --git 
a/sklearn/calibration.py b/sklearn/calibration.py index 5e7bfe2ab4a31..e4869387f4166 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -25,6 +25,7 @@ RegressorMixin, clone, MetaEstimatorMixin, + _fit_context, ) from .preprocessing import label_binarize, LabelEncoder from .utils import ( @@ -318,6 +319,10 @@ def _get_estimator(self): return estimator + @_fit_context( + # CalibratedClassifierCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None, **fit_params): """Fit the calibrated model. @@ -341,8 +346,6 @@ def fit(self, X, y, sample_weight=None, **fit_params): self : object Returns an instance of self. """ - self._validate_params() - check_classification_targets(y) X, y = indexable(X, y) if sample_weight is not None: diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 8a3c2c2acde62..1ffc5f07e8c50 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -12,6 +12,7 @@ from ..exceptions import ConvergenceWarning from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._param_validation import Interval, StrOptions, validate_params from ..utils.validation import check_is_fitted @@ -469,6 +470,7 @@ def __init__( def _more_tags(self): return {"pairwise": self.affinity == "precomputed"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the clustering from features, or affinity matrix. @@ -488,8 +490,6 @@ def fit(self, X, y=None): self Returns the instance itself. """ - self._validate_params() - if self.affinity == "precomputed": accept_sparse = False else: diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 059056275ef3d..b7d08a45dcd80 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -16,6 +16,7 @@ from scipy.sparse.csgraph import connected_components from ..base import BaseEstimator, ClusterMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..metrics.pairwise import paired_distances from ..metrics.pairwise import _VALID_METRICS from ..metrics import DistanceMetric @@ -950,6 +951,7 @@ def __init__( self.metric = metric self.compute_distances = compute_distances + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the hierarchical clustering from features, or distance matrix. @@ -968,7 +970,6 @@ def fit(self, X, y=None): self : object Returns the fitted instance. """ - self._validate_params() X = self._validate_data(X, ensure_min_samples=2) return self._fit(X) @@ -1324,6 +1325,7 @@ def __init__( ) self.pooling_func = pooling_func + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the hierarchical clustering on the data. @@ -1340,7 +1342,6 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - self._validate_params() X = self._validate_data(X, ensure_min_features=2) super()._fit(X.T) self._n_features_out = self.n_clusters_ diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index ba837bacc99d5..4133264626ebb 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -13,6 +13,7 @@ from . 
import KMeans, MiniBatchKMeans from ..base import BaseEstimator, BiclusterMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils import check_scalar @@ -118,6 +119,7 @@ def __init__( def _check_parameters(self, n_samples): """Validate parameters depending on the input data.""" + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Create a biclustering for X. @@ -134,8 +136,6 @@ def fit(self, X, y=None): self : object SpectralBiclustering instance. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csr", dtype=np.float64) self._check_parameters(X.shape[0]) self._fit(X) diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 4c9d7921fdc70..e74630572a014 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -16,6 +16,7 @@ ClusterMixin, BaseEstimator, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..utils.extmath import row_norms from ..utils._param_validation import Interval @@ -501,6 +502,7 @@ def __init__( self.compute_labels = compute_labels self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Build a CF Tree for the input data. @@ -518,9 +520,6 @@ def fit(self, X, y=None): self Fitted estimator. """ - - self._validate_params() - return self._fit(X, partial=False) def _fit(self, X, partial): @@ -610,6 +609,7 @@ def _get_leaves(self): leaf_ptr = leaf_ptr.next_leaf_ return leaves + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X=None, y=None): """ Online learning. Prevents rebuilding of CFTree from scratch. @@ -629,8 +629,6 @@ def partial_fit(self, X=None, y=None): self Fitted estimator. """ - self._validate_params() - if X is None: # Perform just the final global clustering step. self._global_clustering() diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py index fc2b38cc1bca9..959d78ae85009 100644 --- a/sklearn/cluster/_bisect_k_means.py +++ b/sklearn/cluster/_bisect_k_means.py @@ -6,6 +6,7 @@ import numpy as np import scipy.sparse as sp +from ..base import _fit_context from ._kmeans import _BaseKMeans from ._kmeans import _kmeans_single_elkan from ._kmeans import _kmeans_single_lloyd @@ -347,6 +348,7 @@ def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect): cluster_to_bisect.split(best_labels, best_centers, scores) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute bisecting k-means clustering. @@ -373,8 +375,6 @@ def fit(self, X, y=None, sample_weight=None): self Fitted estimator. """ - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index aa81ef27702e6..3c753935ac046 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -16,6 +16,7 @@ from ..metrics.pairwise import _VALID_METRICS from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..utils.validation import _check_sample_weight from ..utils._param_validation import Interval, StrOptions from ..neighbors import NearestNeighbors @@ -338,6 +339,10 @@ def __init__( self.p = p self.n_jobs = n_jobs + @_fit_context( + # DBSCAN.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, sample_weight=None): """Perform DBSCAN clustering from features, or distance matrix. 
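# ---------------------------------------------------------------------------
# Editorial sketch (not part of the patch): the ``@_fit_context(...)`` decorator
# applied throughout this diff replaces the explicit ``self._validate_params()``
# calls that the removed lines contained. The stand-in below is simplified and
# assumes only the public ``sklearn.config_context`` API used in the
# ``sklearn/base.py`` changes shown earlier in this diff; the real decorator
# additionally skips re-validation for ``partial_fit`` on a fitted estimator.
import functools

from sklearn import config_context


def fit_context_sketch(prefer_skip_nested_validation):
    def decorator(fit_method):
        @functools.wraps(fit_method)
        def wrapper(estimator, *args, **kwargs):
            # validate the constructor parameters once, just before fitting
            estimator._validate_params()
            # while this fit runs, nested estimators may skip their own
            # parameter validation when the caller has already validated them
            with config_context(
                skip_parameter_validation=prefer_skip_nested_validation
            ):
                return fit_method(estimator, *args, **kwargs)

        return wrapper

    return decorator
# ---------------------------------------------------------------------------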
@@ -363,8 +368,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Returns a fitted instance of self. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csr") if sample_weight is not None: diff --git a/sklearn/cluster/_feature_agglomeration.py b/sklearn/cluster/_feature_agglomeration.py index 457a83dd41e71..55baf247a2931 100644 --- a/sklearn/cluster/_feature_agglomeration.py +++ b/sklearn/cluster/_feature_agglomeration.py @@ -5,10 +5,12 @@ # Author: V. Michel, A. Gramfort # License: BSD 3 clause +import warnings import numpy as np from ..base import TransformerMixin from ..utils.validation import check_is_fitted +from ..utils import metadata_routing from scipy.sparse import issparse ############################################################################### @@ -20,6 +22,11 @@ class AgglomerationTransform(TransformerMixin): A class for feature agglomeration via the transform interface. """ + # This prevents ``set_split_inverse_transform`` to be generated for the + # non-standard ``Xred`` arg on ``inverse_transform``. + # TODO(1.5): remove when Xred is removed for inverse_transform. + __metadata_request__inverse_transform = {"Xred": metadata_routing.UNUSED} + def transform(self, X): """ Transform a new matrix using the built clustering. @@ -54,22 +61,43 @@ def transform(self, X): nX = np.array(nX).T return nX - def inverse_transform(self, Xred): + def inverse_transform(self, Xt=None, Xred=None): """ Inverse the transformation and return a vector of size `n_features`. Parameters ---------- - Xred : array-like of shape (n_samples, n_clusters) or (n_clusters,) + Xt : array-like of shape (n_samples, n_clusters) or (n_clusters,) The values to be assigned to each cluster of samples. + Xred : deprecated + Use `Xt` instead. + + .. deprecated:: 1.3 + Returns ------- X : ndarray of shape (n_samples, n_features) or (n_features,) A vector of size `n_samples` with the values of `Xred` assigned to each of the cluster of samples. """ + if Xt is None and Xred is None: + raise TypeError("Missing required positional argument: Xt") + + if Xred is not None and Xt is not None: + raise ValueError("Please provide only `Xt`, and not `Xred`.") + + if Xred is not None: + warnings.warn( + ( + "Input argument `Xred` was renamed to `Xt` in v1.3 and will be" + " removed in v1.5." + ), + FutureWarning, + ) + Xt = Xred + check_is_fitted(self) unil, inverse = np.unique(self.labels_, return_inverse=True) - return Xred[..., inverse] + return Xt[..., inverse] diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 971d5735fbe2b..b36999885a14e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -23,6 +23,7 @@ ClusterMixin, TransformerMixin, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..metrics.pairwise import euclidean_distances from ..metrics.pairwise import _euclidean_distances @@ -1448,6 +1449,7 @@ def _warn_mkl_vcomp(self, n_active_threads): f" variable OMP_NUM_THREADS={n_active_threads}." ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute k-means clustering. @@ -1475,8 +1477,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", @@ -2057,6 +2057,7 @@ def _random_reassign(self): return True return False + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. 
@@ -2084,8 +2085,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", @@ -2214,6 +2213,7 @@ def fit(self, X, y=None, sample_weight=None): return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, sample_weight=None): """Update k means estimate on a single mini-batch X. @@ -2241,9 +2241,6 @@ def partial_fit(self, X, y=None, sample_weight=None): """ has_centers = hasattr(self, "cluster_centers_") - if not has_centers: - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 46a00ed3f0740..6b0f227d011f9 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -24,6 +24,7 @@ from ..utils.parallel import delayed, Parallel from ..utils import check_random_state, gen_batches, check_array from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..neighbors import NearestNeighbors from ..metrics.pairwise import pairwise_distances_argmin from .._config import config_context @@ -435,6 +436,7 @@ def __init__( self.n_jobs = n_jobs self.max_iter = max_iter + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Perform clustering. @@ -451,7 +453,6 @@ def fit(self, X, y=None): self : object Fitted instance. """ - self._validate_params() X = self._validate_data(X) bandwidth = self.bandwidth if bandwidth is None: diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 0f1c66ada2d4e..ca1c74d6f44e7 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -24,6 +24,7 @@ from ..utils.validation import check_memory from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..metrics import pairwise_distances from scipy.sparse import issparse, SparseEfficiencyWarning @@ -288,6 +289,10 @@ def __init__( self.memory = memory self.n_jobs = n_jobs + @_fit_context( + # Optics.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Perform OPTICS clustering. @@ -311,8 +316,6 @@ def fit(self, X, y=None): self : object Returns a fitted instance of self. """ - self._validate_params() - dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float if dtype == bool and X.dtype != bool: msg = ( diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index e0ab7da938bfd..f72db4b7c1da3 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -15,6 +15,7 @@ from scipy.sparse import csc_matrix from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..utils._param_validation import Interval, StrOptions, validate_params from ..utils import check_random_state, as_float_array from ..metrics.pairwise import pairwise_kernels, KERNEL_PARAMS @@ -649,6 +650,7 @@ def __init__( self.n_jobs = n_jobs self.verbose = verbose + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Perform spectral clustering from features, or affinity matrix. @@ -671,8 +673,6 @@ def fit(self, X, y=None): self : object A fitted instance of the estimator. 
""" - self._validate_params() - X = self._validate_data( X, accept_sparse=["csr", "csc", "coo"], diff --git a/sklearn/cluster/tests/test_feature_agglomeration.py b/sklearn/cluster/tests/test_feature_agglomeration.py index 3e4aa816b79c0..3db2862384c74 100644 --- a/sklearn/cluster/tests/test_feature_agglomeration.py +++ b/sklearn/cluster/tests/test_feature_agglomeration.py @@ -2,9 +2,11 @@ Tests for sklearn.cluster._feature_agglomeration """ # Authors: Sergul Aydore 2017 +import warnings import numpy as np from numpy.testing import assert_array_equal +import pytest from sklearn.cluster import FeatureAgglomeration from sklearn.utils._testing import assert_array_almost_equal from sklearn.datasets import make_blobs @@ -53,3 +55,25 @@ def test_feature_agglomeration_feature_names_out(): assert_array_equal( [f"featureagglomeration{i}" for i in range(n_clusters)], names_out ) + + +# TODO(1.5): remove this test +def test_inverse_transform_Xred_deprecation(): + X = np.array([0, 0, 1]).reshape(1, 3) # (n_samples, n_features) + + est = FeatureAgglomeration(n_clusters=1, pooling_func=np.mean) + est.fit(X) + Xt = est.transform(X) + + with pytest.raises(TypeError, match="Missing required positional argument"): + est.inverse_transform() + + with pytest.raises(ValueError, match="Please provide only"): + est.inverse_transform(Xt=Xt, Xred=Xt) + + with warnings.catch_warnings(record=True): + warnings.simplefilter("error") + est.inverse_transform(Xt) + + with pytest.warns(FutureWarning, match="Input argument `Xred` was renamed to `Xt`"): + est.inverse_transform(Xred=Xt) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index aab021c0c8d4f..14349662cfee9 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -14,6 +14,7 @@ from scipy import sparse from ..base import clone, TransformerMixin +from ..base import _fit_context from ..utils._estimator_html_repr import _VisualBlock from ..pipeline import _fit_transform_one, _transform_one, _name_estimators from ..preprocessing import FunctionTransformer @@ -701,12 +702,15 @@ def fit(self, X, y=None): self : ColumnTransformer This estimator. """ - self._validate_params() # we use fit_transform to make sure to set sparse_output_ (for which we # need the transformed data) to have consistent output type in predict self.fit_transform(X, y=y) return self + @_fit_context( + # estimators in ColumnTransformer.transformers are not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit all transformers, transform the data and concatenate results. @@ -728,7 +732,6 @@ def fit_transform(self, X, y=None): any result is a sparse matrix, everything will be converted to sparse matrices. """ - self._validate_params() self._check_feature_names(X, reset=True) X = _check_X(X) diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index f31a5a49b641e..e926ed7abe324 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -7,6 +7,7 @@ import numpy as np from ..base import BaseEstimator, RegressorMixin, clone +from ..base import _fit_context from ..utils.validation import check_is_fitted from ..utils._tags import _safe_tags from ..utils import check_array, _safe_indexing @@ -197,6 +198,10 @@ def _fit_transformer(self, y): UserWarning, ) + @_fit_context( + # TransformedTargetRegressor.regressor/transformer are not validated yet. 
+ prefer_skip_nested_validation=False + ) def fit(self, X, y, **fit_params): """Fit the model according to the given training data. @@ -218,7 +223,6 @@ def fit(self, X, y, **fit_params): self : object Fitted estimator. """ - self._validate_params() if y is None: raise ValueError( f"This {self.__class__.__name__} estimator " diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index 1ef0eedd62f64..c99f200592580 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -9,6 +9,7 @@ from ..utils.validation import check_is_fitted from ..metrics import accuracy_score from ..base import OutlierMixin +from ..base import _fit_context class EllipticEnvelope(OutlierMixin, MinCovDet): @@ -162,6 +163,7 @@ def __init__( ) self.contamination = contamination + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the EllipticEnvelope model. @@ -178,7 +180,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - # `_validate_params` is called in `MinCovDet` super().fit(X) self.offset_ = np.percentile(-self.dist_, 100.0 * self.contamination) return self diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 7fc23f36d92d3..8083bfd2e1aa1 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -16,6 +16,7 @@ from .. import config_context from ..base import BaseEstimator +from ..base import _fit_context from ..utils import check_array from ..utils._param_validation import validate_params from ..utils.extmath import fast_logdet @@ -218,6 +219,7 @@ def get_precision(self): precision = linalg.pinvh(self.covariance_, check_finite=False) return precision + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the maximum likelihood covariance estimator to X. @@ -235,7 +237,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index afe21fa3a02f1..8575cc4f75801 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -16,6 +16,7 @@ from . import empirical_covariance, EmpiricalCovariance, log_likelihood +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils.validation import ( _is_arraylike_not_scalar, @@ -532,6 +533,7 @@ def __init__( self.alpha = alpha self.covariance = covariance + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the GraphicalLasso model to X. @@ -548,7 +550,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() # Covariance does not make sense for a single feature X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2) @@ -925,6 +926,7 @@ def __init__( self.cv = cv self.n_jobs = n_jobs + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the GraphicalLasso covariance model to X. @@ -941,7 +943,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() # Covariance does not make sense for a single feature X = self._validate_data(X, ensure_min_features=2) if self.assume_centered: diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index f3dd6d60badf8..c723bba7a097b 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -15,6 +15,7 @@ from scipy.stats import chi2 from . import empirical_covariance, EmpiricalCovariance +from ..base import _fit_context from ..utils.extmath import fast_logdet from ..utils import check_random_state, check_array from ..utils._param_validation import Interval @@ -719,6 +720,7 @@ def __init__( self.support_fraction = support_fraction self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit a Minimum Covariance Determinant with the FastMCD algorithm. @@ -736,7 +738,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X, ensure_min_samples=2, estimator="MinCovDet") random_state = check_random_state(self.random_state) n_samples, n_features = X.shape diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 4bf3d9a490b6b..21d2e034b45d7 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -18,6 +18,7 @@ import numpy as np from . import empirical_covariance, EmpiricalCovariance +from ..base import _fit_context from ..utils import check_array from ..utils._param_validation import Interval, validate_params @@ -237,6 +238,7 @@ def __init__(self, *, store_precision=True, assume_centered=False, shrinkage=0.1 ) self.shrinkage = shrinkage + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the shrunk covariance model to X. @@ -254,7 +256,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X) # Not calling the parent object to fit, to avoid a potential # matrix inversion when setting the precision @@ -533,6 +534,7 @@ def __init__(self, *, store_precision=True, assume_centered=False, block_size=10 ) self.block_size = block_size + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the Ledoit-Wolf shrunk covariance model to X. @@ -549,7 +551,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) X = self._validate_data(X) @@ -722,6 +723,7 @@ class OAS(EmpiricalCovariance): 0.0195... """ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the Oracle Approximating Shrinkage covariance model to X. @@ -738,8 +740,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() - X = self._validate_data(X) # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index a5e5a1ceff09a..da395d8f060fb 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -16,6 +16,7 @@ from ..base import BaseEstimator, RegressorMixin, TransformerMixin from ..base import MultiOutputMixin from ..base import ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_array, check_consistent_length from ..utils.fixes import sp_version from ..utils.fixes import parse_version @@ -208,6 +209,7 @@ def __init__( self.tol = tol self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, Y): """Fit model to data. @@ -226,8 +228,6 @@ def fit(self, X, Y): self : object Fitted model. """ - self._validate_params() - check_consistent_length(X, Y) X = self._validate_data( X, dtype=np.float64, copy=self.copy, ensure_min_samples=2 @@ -958,6 +958,7 @@ def __init__(self, n_components=2, *, scale=True, copy=True): self.scale = scale self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, Y): """Fit model to data. @@ -974,8 +975,6 @@ def fit(self, X, Y): self : object Fitted estimator. """ - self._validate_params() - check_consistent_length(X, Y) X = self._validate_data( X, dtype=np.float64, copy=self.copy, ensure_min_samples=2 diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py index 7b2faa4b67f4d..bba06fbb74021 100644 --- a/sklearn/datasets/_arff_parser.py +++ b/sklearn/datasets/_arff_parser.py @@ -204,7 +204,10 @@ def _io_to_generator(gzip_file): if len(dfs) >= 2: dfs[0] = dfs[0].astype(dfs[1].dtypes) - frame = pd.concat(dfs, ignore_index=True) + # liac-arff parser does not depend on NumPy and uses None to represent + # missing values. To be consistent with the pandas parser, we replace + # None with np.nan. + frame = pd.concat(dfs, ignore_index=True).fillna(value=np.nan) del dfs, first_df # cast the columns frame diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 42f64fba2037b..c13b82dd769d3 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -920,9 +920,7 @@ def datasets_missing_values(): (1119, "liac-arff", 9, 6, 0), (1119, "pandas", 9, 0, 6), # miceprotein - # 1 column has only missing values with object dtype - (40966, "liac-arff", 1, 76, 0), - # with casting it will be transformed to either float or Int64 + (40966, "liac-arff", 1, 77, 0), (40966, "pandas", 1, 77, 0), # titanic (40945, "liac-arff", 3, 6, 0), diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index ab2f87de4bb84..54b3590f5b62e 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1796,6 +1796,7 @@ def fit(self, X, y=None): self.fit_transform(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit the model from data in X and return the transformed data. @@ -1813,8 +1814,6 @@ def fit_transform(self, X, y=None): V : ndarray of shape (n_samples, n_components) Transformed data. 
""" - self._validate_params() - _check_positive_coding(method=self.fit_algorithm, positive=self.positive_code) method = "lasso_" + self.fit_algorithm @@ -2435,6 +2434,7 @@ def fit(self, X, y=None): return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Update the model using the data in X as a mini-batch. @@ -2454,9 +2454,6 @@ def partial_fit(self, X, y=None): """ has_components = hasattr(self, "components_") - if not has_components: - self._validate_params() - X = self._validate_data( X, dtype=[np.float64, np.float32], order="C", reset=not has_components ) diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index a6507d167b9cb..8c3d590b2c814 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -27,6 +27,7 @@ from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import fast_logdet, randomized_svd, squared_norm @@ -197,6 +198,7 @@ def __init__( self.random_state = random_state self.rotation = rotation + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the FactorAnalysis model to X using SVD based approach. @@ -213,8 +215,6 @@ def fit(self, X, y=None): self : object FactorAnalysis class instance. """ - self._validate_params() - X = self._validate_data(X, copy=self.copy, dtype=np.float64) n_samples, n_features = X.shape diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 680a6cd8bbee1..6dcf62c0ace3b 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -16,6 +16,7 @@ from scipy import linalg from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils import check_array, as_float_array, check_random_state from ..utils.validation import check_is_fitted @@ -672,6 +673,7 @@ def g(x, fun_args): return S + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit the model and recover the sources from X. @@ -690,10 +692,9 @@ def fit_transform(self, X, y=None): Estimated sources obtained by transforming the data with the estimated unmixing matrix. """ - self._validate_params() - return self._fit_transform(X, compute_sources=True) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model to X. @@ -711,8 +712,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() - self._fit_transform(X, compute_sources=False) return self diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index d98a5f4fb3b7a..5ae5d58b06ca4 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -9,6 +9,7 @@ from scipy import linalg, sparse from ._base import _BasePCA +from ..base import _fit_context from ..utils import gen_batches from ..utils._param_validation import Interval from ..utils.extmath import svd_flip, _incremental_mean_and_var @@ -192,6 +193,7 @@ def __init__(self, n_components=None, *, whiten=False, copy=True, batch_size=Non self.copy = copy self.batch_size = batch_size + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X, using minibatches of size batch_size. @@ -209,8 +211,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - self.components_ = None self.n_samples_seen_ = 0 self.mean_ = 0.0 @@ -243,6 +243,7 @@ def fit(self, X, y=None): return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, check_input=True): """Incremental fit with X. All of X is processed as a single batch. @@ -265,9 +266,6 @@ def partial_fit(self, X, y=None, check_input=True): """ first_pass = not hasattr(self, "components_") - if first_pass: - self._validate_params() - if check_input: if sparse.issparse(X): raise TypeError( diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index fadcd6f94a2f8..61d502a006c5e 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -19,6 +19,7 @@ from ..utils._param_validation import Interval, StrOptions from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels @@ -404,6 +405,7 @@ def _fit_inverse_transform(self, X_transformed, X): self.dual_coef_ = linalg.solve(K, X, assume_a="pos", overwrite_a=True) self.X_transformed_fit_ = X_transformed + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model from data in X. @@ -421,8 +423,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - if self.fit_inverse_transform and self.kernel == "precomputed": raise ValueError("Cannot fit_inverse_transform with a precomputed kernel.") X = self._validate_data(X, accept_sparse="csr", copy=self.copy_X) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 21829d4fedab3..ab1ea5ebb5460 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -18,6 +18,7 @@ from joblib import effective_n_jobs from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_random_state, gen_batches, gen_even_slices from ..utils.validation import check_non_negative from ..utils.validation import check_is_fitted @@ -568,6 +569,7 @@ def _check_non_neg_array(self, X, reset_n_features, whom): return X + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Online VB with Mini-Batch update. 
@@ -586,9 +588,6 @@ def partial_fit(self, X, y=None): """ first_time = not hasattr(self, "components_") - if first_time: - self._validate_params() - X = self._check_non_neg_array( X, reset_n_features=first_time, whom="LatentDirichletAllocation.partial_fit" ) @@ -618,6 +617,7 @@ def partial_fit(self, X, y=None): return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn model for the data X with variational Bayes method. @@ -637,7 +637,6 @@ def fit(self, X, y=None): self Fitted estimator. """ - self._validate_params() X = self._check_non_neg_array( X, reset_n_features=True, whom="LatentDirichletAllocation.fit" ) diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 67dd0c2ab7b70..d561583dec205 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -19,6 +19,7 @@ from ._cdnmf_fast import _update_cdnmf_fast from .._config import config_context from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils import check_random_state, check_array, gen_batches from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm @@ -31,6 +32,7 @@ StrOptions, validate_params, ) +from ..utils import metadata_routing EPSILON = np.finfo(np.float32).eps @@ -1122,6 +1124,11 @@ def non_negative_factorization( class _BaseNMF(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator, ABC): """Base class for NMF and MiniBatchNMF.""" + # This prevents ``set_split_inverse_transform`` to be generated for the + # non-standard ``W`` arg on ``inverse_transform``. + # TODO: remove when W is removed in v1.5 for inverse_transform + __metadata_request__inverse_transform = {"W": metadata_routing.UNUSED} + _parameter_constraints: dict = { "n_components": [Interval(Integral, 1, None, closed="left"), None], "init": [ @@ -1245,23 +1252,44 @@ def fit(self, X, y=None, **params): self.fit_transform(X, **params) return self - def inverse_transform(self, W): + def inverse_transform(self, Xt=None, W=None): """Transform data back to its original space. .. versionadded:: 0.18 Parameters ---------- - W : {ndarray, sparse matrix} of shape (n_samples, n_components) + Xt : {ndarray, sparse matrix} of shape (n_samples, n_components) Transformed data matrix. + W : deprecated + Use `Xt` instead. + + .. deprecated:: 1.3 + Returns ------- X : {ndarray, sparse matrix} of shape (n_samples, n_features) Returns a data matrix of the original shape. """ + if Xt is None and W is None: + raise TypeError("Missing required positional argument: Xt") + + if W is not None and Xt is not None: + raise ValueError("Please provide only `Xt`, and not `W`.") + + if W is not None: + warnings.warn( + ( + "Input argument `W` was renamed to `Xt` in v1.3 and will be removed" + " in v1.5." + ), + FutureWarning, + ) + Xt = W + check_is_fitted(self) - return W @ self.components_ + return Xt @ self.components_ @property def _n_features_out(self): @@ -1539,6 +1567,7 @@ def _check_params(self, X): return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None, W=None, H=None): """Learn a NMF model for the data X and returns the transformed data. @@ -1566,8 +1595,6 @@ def fit_transform(self, X, y=None, W=None, H=None): W : ndarray of shape (n_samples, n_components) Transformed data. 
""" - self._validate_params() - X = self._validate_data( X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32] ) @@ -2123,6 +2150,7 @@ def _minibatch_convergence( return False + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None, W=None, H=None): """Learn a NMF model for the data X and returns the transformed data. @@ -2149,8 +2177,6 @@ def fit_transform(self, X, y=None, W=None, H=None): W : ndarray of shape (n_samples, n_components) Transformed data. """ - self._validate_params() - X = self._validate_data( X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32] ) @@ -2288,6 +2314,7 @@ def transform(self, X): return W + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, W=None, H=None): """Update the model using the data in `X` as a mini-batch. @@ -2321,9 +2348,6 @@ def partial_fit(self, X, y=None, W=None, H=None): """ has_components = hasattr(self, "components_") - if not has_components: - self._validate_params() - X = self._validate_data( X, accept_sparse=("csr", "csc"), diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index e8c302fc47129..1d3c0678aca89 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -20,6 +20,7 @@ from scipy.sparse.linalg import svds from ._base import _BasePCA +from ..base import _fit_context from ..utils import check_random_state from ..utils._arpack import _init_arpack_v0 from ..utils.deprecation import deprecated @@ -414,6 +415,7 @@ def __init__( def n_features_(self): return self.n_features_in_ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -431,11 +433,10 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - self._fit(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit the model with X and apply the dimensionality reduction on X. @@ -458,8 +459,6 @@ def fit_transform(self, X, y=None): This method returns a Fortran-ordered array. To convert it to a C-ordered array, use 'np.ascontiguousarray'. """ - self._validate_params() - U, S, Vt = self._fit(X) U = U[:, : self.n_components_] diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 5974b86381e1a..93e4a2164a87f 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -12,6 +12,7 @@ from ..utils.validation import check_array, check_is_fitted from ..linear_model import ridge_regression from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ._dict_learning import dict_learning, MiniBatchDictionaryLearning @@ -53,6 +54,7 @@ def __init__( self.verbose = verbose self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model from data in X. @@ -70,7 +72,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() random_state = check_random_state(self.random_state) X = self._validate_data(X) diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 999266a4f3f78..67f5c73028f15 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -12,6 +12,7 @@ from scipy.sparse.linalg import svds from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_array, check_random_state from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip @@ -200,10 +201,10 @@ def fit(self, X, y=None): self : object Returns the transformer object. """ - # param validation is done in fit_transform self.fit_transform(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit model to X and perform dimensionality reduction on X. @@ -220,7 +221,6 @@ def fit_transform(self, X, y=None): X_new : ndarray of shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ - self._validate_params() X = self._validate_data(X, accept_sparse=["csr", "csc"], ensure_min_features=2) random_state = check_random_state(self.random_state) diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index 74218b83c6952..2b1ed4d91be5e 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -1,6 +1,7 @@ import re import sys from io import StringIO +import warnings import numpy as np import scipy.sparse as sp @@ -906,3 +907,29 @@ def test_minibatch_nmf_verbose(): nmf.fit(A) finally: sys.stdout = old_stdout + + +# TODO(1.5): remove this test +def test_NMF_inverse_transform_W_deprecation(): + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(6, 5)) + est = NMF( + n_components=3, + init="random", + random_state=0, + tol=1e-6, + ) + Xt = est.fit_transform(A) + + with pytest.raises(TypeError, match="Missing required positional argument"): + est.inverse_transform() + + with pytest.raises(ValueError, match="Please provide only"): + est.inverse_transform(Xt=Xt, W=Xt) + + with warnings.catch_warnings(record=True): + warnings.simplefilter("error") + est.inverse_transform(Xt) + + with pytest.warns(FutureWarning, match="Input argument `W` was renamed to `Xt`"): + est.inverse_transform(W=Xt) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index c8c0a656e5784..275f4ae4d3b30 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -17,6 +17,7 @@ from .base import BaseEstimator, TransformerMixin, ClassifierMixin from .base import ClassNamePrefixFeaturesOutMixin +from .base import _fit_context from .linear_model._base import LinearClassifierMixin from .covariance import ledoit_wolf, empirical_covariance, shrunk_covariance from .utils.multiclass import unique_labels @@ -546,6 +547,10 @@ def _solve_svd(self, X, y): self.coef_ = coef @ self.scalings_.T self.intercept_ -= self.xbar_ @ self.coef_.T + @_fit_context( + # LinearDiscriminantAnalysis.covariance_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the Linear Discriminant Analysis model. @@ -568,8 +573,6 @@ def fit(self, X, y): self : object Fitted estimator. 
""" - self._validate_params() - xp, _ = get_namespace(X) X, y = self._validate_data( @@ -865,6 +868,7 @@ def __init__( self.store_covariance = store_covariance self.tol = tol + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model according to the given training data and parameters. @@ -889,7 +893,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() X, y = self._validate_data(X, y) check_classification_targets(y) self.classes_, y = np.unique(y, return_inverse=True) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 25f910e8419f4..0d8519484d7a5 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -11,6 +11,7 @@ from .base import BaseEstimator, ClassifierMixin, RegressorMixin from .base import MultiOutputMixin +from .base import _fit_context from .utils import check_random_state from .utils._param_validation import StrOptions, Interval from .utils.validation import _num_samples @@ -142,6 +143,7 @@ def __init__(self, *, strategy="prior", random_state=None, constant=None): self.random_state = random_state self.constant = constant + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the baseline classifier. @@ -161,8 +163,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - self._strategy = self.strategy if self._strategy == "uniform" and sp.issparse(y): @@ -518,6 +518,7 @@ def __init__(self, *, strategy="mean", constant=None, quantile=None): self.constant = constant self.quantile = quantile + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the random regressor. @@ -537,8 +538,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - y = check_array(y, ensure_2d=False, input_name="y") if len(y) == 0: raise ValueError("y must not be empty.") diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index bad6dcfb033ec..0354413fdebfe 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -14,6 +14,7 @@ from ._base import BaseEnsemble, _partition_estimators from ..base import ClassifierMixin, RegressorMixin +from ..base import _fit_context from ..metrics import r2_score, accuracy_score from ..tree import DecisionTreeClassifier, DecisionTreeRegressor from ..utils import check_random_state, column_or_1d @@ -301,6 +302,10 @@ def __init__( self.random_state = random_state self.verbose = verbose + @_fit_context( + # BaseBagging.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Build a Bagging ensemble of estimators from the training set (X, y). @@ -324,9 +329,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. 
""" - - self._validate_params() - # Convert data (X is required to be 2d and indexable) X, y = self._validate_data( X, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 4d9bf862bd806..e715952947c04 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -50,7 +50,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from scipy.sparse import issparse from scipy.sparse import hstack as sparse_hstack -from sklearn.base import is_classifier +from sklearn.base import is_classifier, _fit_context from sklearn.base import ( ClassifierMixin, MultiOutputMixin, @@ -221,6 +221,7 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): None, Interval(Integral, 1, None, closed="left"), ], + "store_leaf_values": [bool], } @abstractmethod @@ -240,6 +241,7 @@ def __init__( max_samples=None, base_estimator="deprecated", max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -257,6 +259,7 @@ def __init__( self.class_weight = class_weight self.max_samples = max_samples self.max_bins = max_bins + self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -333,6 +336,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """ Build a forest of trees from the training set (X, y). @@ -360,8 +364,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - # Validate or convert input data if issparse(y): raise ValueError("sparse multilabel-indicator for y is not supported.") @@ -717,6 +719,139 @@ def _bin_data(self, X, is_training_data): return X_binned + def predict_quantiles(self, X, quantiles=0.5, method="nearest"): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`np.quantile`. + check_input : bool, optional + Whether or not to check input, by default True. + + Returns + ------- + y : ndarray of shape (n_samples, n_quantiles) or + (n_samples, n_quantiles, n_outputs) + The predicted values. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Quantile prediction is not available when store_leaf_values=False" + ) + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros( + (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 + ) + else: + y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) + + # get (n_samples, n_estimators) indicator of leaf nodes + X_leaves = self.apply(X) + + # we now want to aggregate all leaf samples across all trees for each sample + for idx in range(X.shape[0]): + # get leaf nodes for this sample + leaf_nodes = X_leaves[idx, :] + + # (n_total_leaf_samples, n_outputs) + leaf_node_samples = np.vstack( + ( + est.leaf_nodes_samples_[leaf_nodes[jdx]] + for jdx, est in enumerate(self.estimators_) + ) + ) + + # get quantiles across all leaf node samples + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) + + if is_classifier(self): + if self.n_outputs_ == 1: + for i in range(len(quantiles)): + class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) + y_hat[idx, ...] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + else: + for k in range(self.n_outputs_): + for i in range(len(quantiles)): + class_pred_per_sample = ( + y_hat[idx, i, k].squeeze().astype(int) + ) + y_hat[idx, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + return y_hat + + def get_leaf_node_samples(self, X): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_node_samples : a list of array-like of shape + (n_leaf_node_samples, n_outputs) + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. + """ + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) + for e in self.estimators_ + ) + leaf_nodes_samples = result[0] + for result_ in result[1:]: + for i, node_samples in enumerate(result_): + leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) + return leaf_nodes_samples + def _accumulate_prediction(predict, X, out, lock): """ @@ -734,6 +869,17 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] +def _accumulate_leaf_nodes_samples(func, X): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. 
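# ---------------------------------------------------------------------------
# Editorial usage sketch (not part of the patch) for the quantile-prediction
# API added above. It assumes an estimator built from this fork, i.e. a
# RandomForestRegressor exposing the new ``store_leaf_values`` parameter,
# ``predict_quantiles`` and ``get_leaf_node_samples``.
import numpy as np

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=4, random_state=0)
forest = RandomForestRegressor(
    n_estimators=50,
    store_leaf_values=True,  # keep the training samples that fall in each leaf
    random_state=0,
).fit(X, y)

# median and an 80% interval computed from the pooled leaf-node samples
quantile_pred = forest.predict_quantiles(X[:5], quantiles=[0.1, 0.5, 0.9])
print(quantile_pred.shape)  # (5, 3): one row per query point, one column per quantile

# raw training samples that share a leaf with each query point
leaf_samples = forest.get_leaf_node_samples(X[:5])
print(len(leaf_samples), leaf_samples[0].shape[1])  # 5 arrays, n_outputs columns each
# ---------------------------------------------------------------------------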
+ """ + leaf_nodes_samples = func(X, check_input=False) + return leaf_nodes_samples + + class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. @@ -759,6 +905,7 @@ def __init__( max_samples=None, base_estimator="deprecated", max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -774,6 +921,7 @@ def __init__( max_samples=max_samples, base_estimator=base_estimator, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) @staticmethod @@ -1037,6 +1185,7 @@ def __init__( max_samples=None, base_estimator="deprecated", max_bins=None, + store_leaf_values=False, ): super().__init__( estimator, @@ -1051,6 +1200,7 @@ def __init__( max_samples=max_samples, base_estimator=base_estimator, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -1515,6 +1665,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=DecisionTreeClassifier(), @@ -1530,6 +1681,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -1540,6 +1692,7 @@ def __init__( class_weight=class_weight, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1858,6 +2011,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=DecisionTreeRegressor(), @@ -1873,6 +2027,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -1882,6 +2037,7 @@ def __init__( warm_start=warm_start, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2210,6 +2366,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeClassifier(), @@ -2225,6 +2382,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2235,6 +2393,7 @@ def __init__( class_weight=class_weight, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2534,6 +2693,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2549,6 +2709,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2558,6 +2719,7 @@ def __init__( warm_start=warm_start, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2783,6 +2945,7 @@ def __init__( random_state=None, verbose=0, warm_start=False, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2797,6 +2960,7 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", + "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -2805,6 +2969,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, + store_leaf_values=store_leaf_values, ) self.max_depth = max_depth @@ -2848,6 +3013,7 @@ def fit(self, X, y=None, sample_weight=None): self.fit_transform(X, y, sample_weight=sample_weight) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, 
X, y=None, sample_weight=None): """ Fit estimator and transform dataset. @@ -2873,8 +3039,6 @@ def fit_transform(self, X, y=None, sample_weight=None): X_transformed : sparse matrix of shape (n_samples, n_out) Transformed dataset. """ - self._validate_params() - rnd = check_random_state(self.random_state) y = rnd.uniform(size=_num_samples(X)) super().fit(X, y, sample_weight=sample_weight) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index df9904c8a9aa4..8d435873aeb5c 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -28,6 +28,7 @@ from ._base import BaseEnsemble from ..base import ClassifierMixin, RegressorMixin from ..base import is_classifier +from ..base import _fit_context from ._gradient_boosting import predict_stages from ._gradient_boosting import predict_stage @@ -146,6 +147,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): "n_iter_no_change": [Interval(Integral, 1, None, closed="left"), None], "tol": [Interval(Real, 0.0, None, closed="left")], } + _parameter_constraints.pop("store_leaf_values") _parameter_constraints.pop("splitter") @abstractmethod @@ -376,6 +378,10 @@ def _check_initialized(self): """Check that the estimator is initialized, raising an error if not.""" check_is_fitted(self) + @_fit_context( + # GradientBoosting*.init is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None, monitor=None): """Fit the gradient boosting model. @@ -412,8 +418,6 @@ def fit(self, X, y, sample_weight=None, monitor=None): self : object Fitted estimator. """ - self._validate_params() - if not self.warm_start: self._clear_state() diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 976335ea684d0..79b640057abe5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -18,6 +18,7 @@ PinballLoss, ) from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier +from ...base import _fit_context from ...utils import check_random_state, resample, compute_sample_weight from ...utils.validation import ( check_is_fitted, @@ -336,6 +337,7 @@ def _check_interaction_cst(self, n_features): return constraints + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the gradient boosting model. @@ -357,8 +359,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - fit_start_time = time() acc_find_split_time = 0.0 # time spent finding the best splits acc_apply_split_time = 0.0 # time spent splitting nodes diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index bb016fa33185b..048a1d69395e2 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -20,6 +20,7 @@ from ..utils._param_validation import RealNotInt from ..utils.validation import check_is_fitted, _num_samples from ..base import OutlierMixin +from ..base import _fit_context from ._bagging import BaseBagging @@ -265,6 +266,7 @@ def _parallel_args(self): # copies. return {"prefer": "threads"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """ Fit estimator. @@ -287,7 +289,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. 
""" - self._validate_params() X = self._validate_data(X, accept_sparse=["csc"], dtype=tree_dtype) if issparse(X): # Pre-sort indices to avoid that each individual tree of the diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 10f7a606f20c9..5b3486edfeb33 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -13,6 +13,7 @@ from ..base import clone from ..base import ClassifierMixin, RegressorMixin, TransformerMixin from ..base import is_classifier, is_regressor +from ..base import _fit_context from ..exceptions import NotFittedError from ..utils._estimator_html_repr import _VisualBlock @@ -159,6 +160,10 @@ def _method_name(name, estimator, method): return method_name + @_fit_context( + # estimators in Stacking*.estimators are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit the estimators. @@ -184,9 +189,6 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - - self._validate_params() - # all_estimators contains all estimators, the one to be fitted and the # 'drop' string. names, all_estimators = self._validate_estimators() diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 1c250cbe11a06..f8f4d2c4c197f 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -22,6 +22,7 @@ from ..base import RegressorMixin from ..base import TransformerMixin from ..base import clone +from ..base import _fit_context from ._base import _fit_single_estimator from ._base import _BaseHeterogeneousEnsemble from ..preprocessing import LabelEncoder @@ -308,6 +309,10 @@ def __init__( self.flatten_transform = flatten_transform self.verbose = verbose + @_fit_context( + # estimators in VotingClassifier.estimators are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit the estimators. @@ -332,7 +337,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() check_classification_targets(y) if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1: raise NotImplementedError( @@ -572,6 +576,10 @@ def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False): self.n_jobs = n_jobs self.verbose = verbose + @_fit_context( + # estimators in VotingRegressor.estimators are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit the estimators. @@ -594,7 +602,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() y = column_or_1d(y, warn=True) return super().fit(X, y, sample_weight) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index b2aff503b0bb0..569609e6326e5 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -34,7 +34,7 @@ from ._base import BaseEnsemble from ..base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor - +from ..base import _fit_context from ..tree import DecisionTreeClassifier, DecisionTreeRegressor from ..utils import check_random_state, _safe_indexing from ..utils.extmath import softmax @@ -103,6 +103,10 @@ def _check_X(self, X): reset=False, ) + @_fit_context( + # AdaBoost*.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Build a boosted classifier/regressor from the training set (X, y). 
@@ -124,8 +128,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - X, y = self._validate_data( X, y, diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index d96f5c76842bf..a78e12a5a5181 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -1984,3 +1984,54 @@ def test_regression_criterion_withbins(name, criterion): criterion, score, ) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput_quantiles(name): + # Check estimators on multi-output problems. + X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + est = FOREST_ESTIMATORS[name]( + random_state=0, bootstrap=False, store_leaf_values=True + ) + est.fit(X_train, y_train) + + y_pred = est.predict_quantiles(X_test, quantiles=[0.25, 0.5, 0.75]) + assert_array_almost_equal(y_pred[:, 1, :], y_test) + assert_array_almost_equal(y_pred[:, 0, :], y_test) + assert_array_almost_equal(y_pred[:, 2, :], y_test) + + # test the leaf nodes samples + leaf_nodes_samples = est.get_leaf_node_samples(X_test) + assert len(leaf_nodes_samples) == len(X_test) + for node_samples in leaf_nodes_samples: + assert node_samples.shape[1] == est.n_outputs_ diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index b51ccceaac9d1..60e2cb3b7ad84 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -11,6 +11,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils import check_array from ..utils.validation import check_is_fitted @@ -133,6 +134,7 @@ def _add_iterable_element( indices.append(vocab[feature_name]) values.append(self.dtype(vv)) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn a list of feature name -> indices mappings. @@ -153,7 +155,6 @@ def fit(self, X, y=None): self : object DictVectorizer class instance. """ - self._validate_params() feature_names = [] vocab = {} @@ -286,6 +287,7 @@ def _transform(self, X, fitting): return result_matrix + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Learn a list of feature name -> indices mappings and transform X. @@ -309,7 +311,6 @@ def fit_transform(self, X, y=None): Xa : {array, sparse matrix} Feature vectors; always 2-d. 
""" - self._validate_params() return self._transform(X, fitting=True) def inverse_transform(self, X, dict_type=dict): diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index 1f2513e70eed5..e1b5e5f2561fe 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -8,6 +8,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ._hashing_fast import transform as _hashing_transform from ..utils._param_validation import Interval, StrOptions @@ -121,6 +122,7 @@ def __init__( self.n_features = n_features self.alternate_sign = alternate_sign + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X=None, y=None): """Only validates estimator's parameters. @@ -140,8 +142,6 @@ def fit(self, X=None, y=None): self : object FeatureHasher class instance. """ - # repeat input validation for grid search (which calls set_params) - self._validate_params() return self def transform(self, raw_X): diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 89bdd7557f583..beea3e23e0adc 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -16,6 +16,7 @@ from numpy.lib.stride_tricks import as_strided from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils import check_array, check_random_state from ..utils._param_validation import Hidden, Interval, validate_params from ..utils._param_validation import RealNotInt @@ -561,6 +562,7 @@ def __init__(self, *, patch_size=None, max_patches=None, random_state=None): self.max_patches = max_patches self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validate the parameters of the estimator. @@ -583,7 +585,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() return self def transform(self, X): diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 21863d75eff2f..3201e3a0d51bb 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -25,6 +25,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin +from ..base import _fit_context from ..preprocessing import normalize from ._hash import FeatureHasher from ._stop_words import ENGLISH_STOP_WORDS @@ -801,6 +802,7 @@ def __init__( self.alternate_sign = alternate_sign self.dtype = dtype + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Only validates estimator's parameters. @@ -820,10 +822,9 @@ def partial_fit(self, X, y=None): self : object HashingVectorizer instance. """ - # TODO: only validate during the first call - self._validate_params() return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -843,8 +844,6 @@ def fit(self, X, y=None): self : object HashingVectorizer instance. """ - self._validate_params() - # triggers a parameter validation if isinstance(X, str): raise ValueError( @@ -1338,6 +1337,7 @@ def fit(self, raw_documents, y=None): self.fit_transform(raw_documents) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, raw_documents, y=None): """Learn the vocabulary dictionary and return document-term matrix. 
@@ -1365,7 +1365,6 @@ def fit_transform(self, raw_documents, y=None): "Iterable over raw text documents expected, string object received." ) - self._validate_params() self._validate_ngram_range() self._warn_for_unused_params() self._validate_vocabulary() @@ -1639,6 +1638,7 @@ def __init__(self, *, norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=Fal self.smooth_idf = smooth_idf self.sublinear_tf = sublinear_tf + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn the idf vector (global term weights). @@ -1655,8 +1655,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() - # large sparse data is not supported for 32bit platforms because # _document_frequency uses np.bincount which works on arrays of # dtype NPY_INTP which is int32 for 32bit platforms. See #20923 @@ -2073,6 +2071,7 @@ def _check_params(self): UserWarning, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, raw_documents, y=None): """Learn vocabulary and idf from training set. @@ -2089,7 +2088,6 @@ def fit(self, raw_documents, y=None): self : object Fitted vectorizer. """ - self._validate_params() self._check_params() self._warn_for_unused_params() self._tfidf = TfidfTransformer( diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 7b8de4ae03585..47f98d89e8abe 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -9,6 +9,7 @@ from ._base import SelectorMixin from ._base import _get_feature_importances from ..base import BaseEstimator, clone, MetaEstimatorMixin +from ..base import _fit_context from ..utils._tags import _safe_tags from ..utils.validation import check_is_fitted, check_scalar, _num_features from ..utils._param_validation import HasMethods, Interval, Options @@ -320,6 +321,10 @@ def _check_max_features(self, X): ) self.max_features_ = max_features + @_fit_context( + # SelectFromModel.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, **fit_params): """Fit the SelectFromModel meta-transformer. @@ -340,7 +345,6 @@ def fit(self, X, y=None, **fit_params): self : object Fitted estimator. """ - self._validate_params() self._check_max_features(X) if self.prefit: @@ -375,6 +379,10 @@ def threshold_(self): return _calculate_threshold(self.estimator, scores, self.threshold) @available_if(_estimator_has("partial_fit")) + @_fit_context( + # SelectFromModel.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y=None, **fit_params): """Fit the SelectFromModel meta-transformer only once. 
@@ -398,7 +406,6 @@ def partial_fit(self, X, y=None, **fit_params): first_call = not hasattr(self, "estimator_") if first_call: - self._validate_params() self._check_max_features(X) if self.prefit: diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 214ac9e0c30cf..932d66449ae22 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -22,6 +22,7 @@ from ..base import MetaEstimatorMixin from ..base import clone from ..base import is_classifier +from ..base import _fit_context from ..model_selection import check_cv from ..model_selection._validation import _score from ..metrics import check_scoring @@ -228,6 +229,10 @@ def classes_(self): """ return self.estimator_.classes_ + @_fit_context( + # RFE.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, **fit_params): """Fit the RFE model and then the underlying estimator on the selected features. @@ -248,7 +253,6 @@ def fit(self, X, y, **fit_params): self : object Fitted estimator. """ - self._validate_params() return self._fit(X, y, **fit_params) def _fit(self, X, y, step_score=None, **fit_params): @@ -649,6 +653,10 @@ def __init__( self.n_jobs = n_jobs self.min_features_to_select = min_features_to_select + @_fit_context( + # RFECV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, groups=None): """Fit the RFE model and automatically tune the number of selected features. @@ -674,7 +682,6 @@ def fit(self, X, y, groups=None): self : object Fitted estimator. """ - self._validate_params() tags = self._get_tags() X, y = self._validate_data( X, diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index 8a61bdee0c554..0fbe91273053b 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -7,6 +7,7 @@ from ._base import SelectorMixin from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier +from ..base import _fit_context from ..utils._param_validation import HasMethods, Interval, StrOptions from ..utils._param_validation import RealNotInt from ..utils._tags import _safe_tags @@ -179,6 +180,10 @@ def __init__( self.cv = cv self.n_jobs = n_jobs + @_fit_context( + # SequentialFeatureSelector.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Learn the features to select from X. @@ -197,8 +202,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - tags = self._get_tags() X = self._validate_data( X, diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 18e23d105b8bb..f4355c39f88cd 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -13,6 +13,7 @@ from scipy.sparse import issparse from ..base import BaseEstimator +from ..base import _fit_context from ..preprocessing import LabelBinarizer from ..utils import as_float_array, check_array, check_X_y, safe_sqr, safe_mask from ..utils.extmath import safe_sparse_dot, row_norms @@ -473,6 +474,7 @@ class _BaseFilter(SelectorMixin, BaseEstimator): def __init__(self, score_func): self.score_func = score_func + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Run score function on (X, y) and get the appropriate features. 
@@ -490,8 +492,6 @@ def fit(self, X, y): self : object Returns the instance itself. """ - self._validate_params() - X, y = self._validate_data( X, y, accept_sparse=["csr", "csc"], multi_output=True ) diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 7c8db9cc7fa55..073a22c6ad92b 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -4,6 +4,7 @@ import numpy as np from ..base import BaseEstimator +from ..base import _fit_context from ._base import SelectorMixin from ..utils.sparsefuncs import mean_variance_axis, min_max_axis from ..utils.validation import check_is_fitted @@ -76,6 +77,7 @@ class VarianceThreshold(SelectorMixin, BaseEstimator): def __init__(self, threshold=0.0): self.threshold = threshold + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn empirical variances from X. @@ -94,7 +96,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data( X, accept_sparse=("csr", "csc"), diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index 4a88034768870..50a8739372972 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -13,6 +13,7 @@ from scipy.special import erf, expit from ..base import BaseEstimator, ClassifierMixin, clone +from ..base import _fit_context from .kernels import Kernel, RBF, CompoundKernel, ConstantKernel as C from ..utils.validation import check_is_fitted from ..utils import check_random_state @@ -679,6 +680,7 @@ def __init__( self.multi_class = multi_class self.n_jobs = n_jobs + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit Gaussian process classification model. @@ -695,8 +697,6 @@ def fit(self, X, y): self : object Returns an instance of self. """ - self._validate_params() - if isinstance(self.kernel, CompoundKernel): raise ValueError("kernel cannot be a CompoundKernel") diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 9b7141f71b884..49fcab40c25f8 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -14,6 +14,7 @@ from ..base import BaseEstimator, RegressorMixin, clone from ..base import MultiOutputMixin +from ..base import _fit_context from .kernels import Kernel, RBF, ConstantKernel as C from ..preprocessing._data import _handle_zeros_in_scale from ..utils import check_random_state @@ -214,6 +215,7 @@ def __init__( self.n_targets = n_targets self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit Gaussian process regression model. @@ -230,8 +232,6 @@ def fit(self, X, y): self : object GaussianProcessRegressor class instance. 
""" - self._validate_params() - if self.kernel is None: # Use an RBF kernel as default self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF( 1.0, length_scale_bounds="fixed" diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index b2f296c91740e..37fc43731514a 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -11,6 +11,7 @@ from scipy import sparse as sp from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils._param_validation import StrOptions, MissingValues from ..utils.fixes import _mode from ..utils.sparsefuncs import _get_median @@ -348,6 +349,7 @@ def _validate_input(self, X, in_fit): return X + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the imputer on `X`. @@ -365,8 +367,6 @@ def fit(self, X, y=None): self : object Fitted estimator. """ - self._validate_params() - X = self._validate_input(X, in_fit=True) # default fill_value is 0 for numerical input and "missing_value" @@ -927,6 +927,7 @@ def _fit(self, X, y=None, precomputed=False): return missing_features_info[0] + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the transformer on `X`. @@ -944,7 +945,6 @@ def fit(self, X, y=None): self : object Fitted estimator. """ - self._validate_params() self._fit(X, y) return self @@ -990,6 +990,7 @@ def transform(self, X): return imputer_mask + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Generate missing values indicator for `X`. @@ -1008,7 +1009,6 @@ def fit_transform(self, X, y=None): The missing indicator for input data. The data type of `Xt` will be boolean. """ - self._validate_params() imputer_mask = self._fit(X, y) if self.features_.size < self._n_features: diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 41ed19b7a8948..f977e5bc23e6c 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -7,6 +7,7 @@ import numpy as np from ..base import clone +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..preprocessing import normalize from ..utils import ( @@ -627,7 +628,7 @@ def _initial_imputation(self, X, in_fit=False): strategy=self.initial_strategy, fill_value=self.fill_value, keep_empty_features=self.keep_empty_features, - ) + ).set_output(transform="default") X_filled = self.initial_imputer_.fit_transform(X) else: X_filled = self.initial_imputer_.transform(X) @@ -681,6 +682,10 @@ def _validate_limit(limit, limit_type, n_features): ) return limit + @_fit_context( + # IterativeImputer.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit the imputer on `X` and return the transformed `X`. @@ -698,7 +703,6 @@ def fit_transform(self, X, y=None): Xt : array-like, shape (n_samples, n_features) The imputed input data. 
""" - self._validate_params() self.random_state_ = getattr( self, "random_state_", check_random_state(self.random_state) ) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 5735709dd7f29..915f8cbdb3fcb 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -6,6 +6,7 @@ import numpy as np from ._base import _BaseImputer +from ..base import _fit_context from ..utils.validation import FLOAT_DTYPES from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import _NAN_METRICS @@ -199,6 +200,7 @@ def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col): return np.ma.average(donors, axis=1, weights=weight_matrix).data + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the imputer on X. @@ -216,7 +218,6 @@ def fit(self, X, y=None): self : object The fitted `KNNImputer` class instance. """ - self._validate_params() # Check data integrity and calling arguments if not is_scalar_nan(self.missing_values): force_all_finite = True diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index aa1521ab697d0..a1cf95b95591b 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -11,6 +11,7 @@ import math from .base import BaseEstimator, TransformerMixin, RegressorMixin +from .base import _fit_context from .utils import check_array, check_consistent_length from .utils.validation import _check_sample_weight, check_is_fitted from .utils._param_validation import Interval, StrOptions @@ -310,6 +311,7 @@ def _build_y(self, X, y, sample_weight, trim_duplicates=True): # prediction speed). return X, y + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model using X, y as training data. @@ -338,7 +340,6 @@ def fit(self, X, y, sample_weight=None): X is stored for future use, as :meth:`transform` needs X to interpolate new input data. """ - self._validate_params() check_params = dict(accept_sparse=False, ensure_2d=False) X = check_array( X, input_name="X", dtype=[np.float64, np.float32], **check_params diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index faa098e634937..7f190a2b66823 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -23,6 +23,7 @@ from .base import BaseEstimator from .base import TransformerMixin from .base import ClassNamePrefixFeaturesOutMixin +from .base import _fit_context from .utils import check_random_state from .utils import deprecated from .utils.extmath import safe_sparse_dot @@ -139,6 +140,7 @@ def __init__( self.n_components = n_components self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -160,8 +162,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csc") random_state = check_random_state(self.random_state) @@ -338,6 +338,7 @@ def __init__(self, *, gamma=1.0, n_components=100, random_state=None): self.n_components = n_components self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -358,8 +359,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() - X = self._validate_data(X, accept_sparse="csr") random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -498,6 +497,7 @@ def __init__(self, *, skewedness=1.0, n_components=100, random_state=None): self.n_components = n_components self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -518,7 +518,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X) random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -665,6 +664,7 @@ def __init__(self, *, sample_steps=2, sample_interval=None): self.sample_steps = sample_steps self.sample_interval = sample_interval + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -686,7 +686,6 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - self._validate_params() X = self._validate_data(X, accept_sparse="csr") check_non_negative(X, "X in AdditiveChi2Sampler.fit") @@ -1011,6 +1010,7 @@ def __init__( self.random_state = random_state self.n_jobs = n_jobs + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit estimator to data. @@ -1032,7 +1032,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X, accept_sparse="csr") rnd = check_random_state(self.random_state) n_samples = X.shape[0] diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 111e62938f096..a7bfeefaef651 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -8,6 +8,7 @@ import numpy as np from .base import BaseEstimator, RegressorMixin, MultiOutputMixin +from .base import _fit_context from .utils._param_validation import Interval, StrOptions from .metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS, pairwise_kernels from .linear_model._ridge import _solve_cholesky_kernel @@ -170,6 +171,7 @@ def _get_kernel(self, X, Y=None): def _more_tags(self): return {"pairwise": self.kernel == "precomputed"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Kernel Ridge regression model. @@ -190,8 +192,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - # Convert data X, y = self._validate_data( X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 06d8664dc013b..92c067c850225 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -28,6 +28,7 @@ from numbers import Integral from ..base import BaseEstimator, ClassifierMixin, RegressorMixin, MultiOutputMixin +from ..base import _fit_context from ..preprocessing._data import _is_constant_feature from ..utils import check_array from ..utils.validation import FLOAT_DTYPES @@ -642,6 +643,7 @@ def __init__( self.n_jobs = n_jobs self.positive = positive + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """ Fit linear model. @@ -665,9 +667,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted Estimator. 
""" - - self._validate_params() - n_jobs_ = self.n_jobs accept_sparse = False if self.positive else ["csr", "csc", "coo"] diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 887c6a3ebcbbc..37dc3b81511f5 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -13,6 +13,7 @@ from ._base import LinearModel, _preprocess_data, _rescale_data from ..base import RegressorMixin +from ..base import _fit_context from ..utils.extmath import fast_logdet from scipy.linalg import pinvh from ..utils.validation import _check_sample_weight @@ -267,6 +268,7 @@ def __init__( self.verbose = verbose self.n_iter = n_iter + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model. @@ -288,8 +290,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - max_iter = _deprecate_n_iter(self.n_iter, self.max_iter) X, y = self._validate_data(X, y, dtype=[np.float64, np.float32], y_numeric=True) @@ -665,6 +665,7 @@ def __init__( self.verbose = verbose self.n_iter = n_iter + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model according to the given training data and parameters. @@ -683,9 +684,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - - self._validate_params() - max_iter = _deprecate_n_iter(self.n_iter, self.max_iter) X, y = self._validate_data( diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index ea1ee3115ea93..829c0ab6149f1 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -18,6 +18,7 @@ from ._base import LinearModel, _pre_fit from ..base import RegressorMixin, MultiOutputMixin +from ..base import _fit_context from ._base import _preprocess_data from ..utils import check_array, check_scalar from ..utils.validation import check_random_state @@ -851,6 +852,7 @@ def __init__( self.random_state = random_state self.selection = selection + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Fit model with coordinate descent. @@ -886,8 +888,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ - self._validate_params() - if self.alpha == 0: warnings.warn( ( @@ -1475,6 +1475,7 @@ def _is_multitask(self): def path(X, y, **kwargs): """Compute path with coordinate descent.""" + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit linear model with coordinate descent. @@ -1502,9 +1503,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns an instance of fitted model. """ - - self._validate_params() - # This makes sure that there is no duplication in memory. # Dealing right with copy_X is important in the following: # Multiple functions touch X and subsamples of X and can induce a @@ -2343,6 +2341,7 @@ def __init__( self.random_state = random_state self.selection = selection + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit MultiTaskElasticNet model with coordinate descent. @@ -2367,8 +2366,6 @@ def fit(self, X, y): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ - self._validate_params() - # Need to validate separately here. 
# We can't pass multi_output=True because that would allow y to be csr. check_X_params = dict( diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index caf37a0f473e0..b1bc460f24dff 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -20,6 +20,7 @@ HalfTweedieLossIdentity, ) from ...base import BaseEstimator, RegressorMixin +from ...base import _fit_context from ...utils import check_array from ...utils._openmp_helpers import _openmp_effective_n_threads from ...utils._param_validation import Hidden, Interval, StrOptions @@ -168,6 +169,7 @@ def __init__( self.warm_start = warm_start self.verbose = verbose + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit a Generalized Linear Model. @@ -187,8 +189,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted model. """ - self._validate_params() - X, y = self._validate_data( X, y, diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index a7b848f647560..def2ae273d5c4 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -7,6 +7,7 @@ from scipy import optimize from ..base import BaseEstimator, RegressorMixin +from ..base import _fit_context from ._base import LinearModel from ..utils import axis0_safe_slice from ..utils._param_validation import Interval @@ -273,6 +274,7 @@ def __init__( self.fit_intercept = fit_intercept self.tol = tol + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -293,7 +295,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted `HuberRegressor` estimator. """ - self._validate_params() X, y = self._validate_data( X, y, diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 4be8bb730a0ae..e6c653eb80bb3 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -20,6 +20,7 @@ from ._base import LinearModel, LinearRegression from ._base import _deprecate_normalize, _preprocess_data from ..base import RegressorMixin, MultiOutputMixin +from ..base import _fit_context # mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' from ..utils import arrayfuncs, as_float_array # type: ignore @@ -1097,6 +1098,7 @@ def _fit(self, X, y, max_iter, alpha, fit_path, normalize, Xy=None): self._set_intercept(X_offset, y_offset, X_scale) return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, Xy=None): """Fit the model using X, y as training data. @@ -1118,8 +1120,6 @@ def fit(self, X, y, Xy=None): self : object Returns an instance of self. """ - self._validate_params() - X, y = self._validate_data(X, y, y_numeric=True, multi_output=True) _normalize = _deprecate_normalize( @@ -1691,6 +1691,7 @@ def __init__( def _more_tags(self): return {"multioutput": False} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model using X, y as training data. @@ -1707,8 +1708,6 @@ def fit(self, X, y): self : object Returns an instance of self. """ - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) @@ -2216,6 +2215,7 @@ def __init__( def _more_tags(self): return {"multioutput": False} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, copy_X=None): """Fit the model using X, y as training data. 
@@ -2237,8 +2237,6 @@ def fit(self, X, y, copy_X=None): self : object Returns an instance of self. """ - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 3db27d9cc3163..30a0f40a0f2fd 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -24,6 +24,7 @@ from ._linear_loss import LinearModelLoss from ._sag import sag_solver from ._glm.glm import NewtonCholeskySolver +from ..base import _fit_context from .._loss.loss import HalfBinomialLoss, HalfMultinomialLoss from ..preprocessing import LabelEncoder, LabelBinarizer from ..svm._base import _fit_liblinear @@ -1132,6 +1133,7 @@ def __init__( self.n_jobs = n_jobs self.l1_ratio = l1_ratio + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """ Fit the model according to the given training data. @@ -1161,9 +1163,6 @@ def fit(self, X, y, sample_weight=None): ----- The SAGA solver supports both float64 and float32 bit arrays. """ - - self._validate_params() - solver = _check_solver(self.solver, self.penalty, self.dual) if self.penalty != "elasticnet" and self.l1_ratio is not None: @@ -1745,6 +1744,7 @@ def __init__( self.random_state = random_state self.l1_ratios = l1_ratios + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -1766,9 +1766,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted LogisticRegressionCV estimator. """ - - self._validate_params() - solver = _check_solver(self.solver, self.penalty, self.dual) if self.penalty == "elasticnet": diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index b1dc1e352fd62..df451a99417b0 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -15,6 +15,7 @@ from ._base import LinearModel, _pre_fit, _deprecate_normalize from ..base import RegressorMixin, MultiOutputMixin +from ..base import _fit_context from ..utils import as_float_array, check_array from ..utils.parallel import delayed, Parallel from ..utils._param_validation import Hidden, Interval, StrOptions @@ -725,6 +726,7 @@ def __init__( self.normalize = normalize self.precompute = precompute + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model using X, y as training data. @@ -741,8 +743,6 @@ def fit(self, X, y): self : object Returns an instance of self. """ - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) @@ -1042,6 +1042,7 @@ def __init__( self.n_jobs = n_jobs self.verbose = verbose + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model using X, y as training data. @@ -1058,8 +1059,6 @@ def fit(self, X, y): self : object Returns an instance of self. 
""" - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) diff --git a/sklearn/linear_model/_passive_aggressive.py b/sklearn/linear_model/_passive_aggressive.py index 2cacd4f78cc54..a9c81799c8ca3 100644 --- a/sklearn/linear_model/_passive_aggressive.py +++ b/sklearn/linear_model/_passive_aggressive.py @@ -5,6 +5,7 @@ from ._stochastic_gradient import BaseSGDClassifier from ._stochastic_gradient import BaseSGDRegressor from ._stochastic_gradient import DEFAULT_EPSILON +from ..base import _fit_context from ..utils._param_validation import Interval, StrOptions @@ -220,6 +221,7 @@ def __init__( self.C = C self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None): """Fit linear model with Passive Aggressive algorithm. @@ -245,7 +247,6 @@ def partial_fit(self, X, y, classes=None): Fitted estimator. """ if not hasattr(self, "classes_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) if self.class_weight == "balanced": @@ -276,6 +277,7 @@ def partial_fit(self, X, y, classes=None): intercept_init=None, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None): """Fit linear model with Passive Aggressive algorithm. @@ -298,7 +300,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None): self : object Fitted estimator. """ - self._validate_params() self._more_validate_params() lr = "pa1" if self.loss == "hinge" else "pa2" @@ -504,6 +505,7 @@ def __init__( self.C = C self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y): """Fit linear model with Passive Aggressive algorithm. @@ -521,7 +523,6 @@ def partial_fit(self, X, y): Fitted estimator. """ if not hasattr(self, "coef_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2" @@ -538,6 +539,7 @@ def partial_fit(self, X, y): intercept_init=None, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None): """Fit linear model with Passive Aggressive algorithm. @@ -560,7 +562,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None): self : object Fitted estimator. """ - self._validate_params() self._more_validate_params() lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2" diff --git a/sklearn/linear_model/_quantile.py b/sklearn/linear_model/_quantile.py index 081e3da5b51b7..b4a5581386a5f 100644 --- a/sklearn/linear_model/_quantile.py +++ b/sklearn/linear_model/_quantile.py @@ -9,6 +9,7 @@ from scipy.optimize import linprog from ..base import BaseEstimator, RegressorMixin +from ..base import _fit_context from ._base import LinearModel from ..exceptions import ConvergenceWarning from ..utils import _safe_indexing @@ -141,6 +142,7 @@ def __init__( self.solver = solver self.solver_options = solver_options + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -160,7 +162,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns self. 
""" - self._validate_params() X, y = self._validate_data( X, y, diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 2474a25f07199..1c12ecc13a258 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -9,6 +9,7 @@ from ..base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone from ..base import MultiOutputMixin +from ..base import _fit_context from ..utils import check_random_state, check_consistent_length from ..utils.random import sample_without_replacement from ..utils.validation import check_is_fitted, _check_sample_weight @@ -283,6 +284,10 @@ def __init__( self.random_state = random_state self.loss = loss + @_fit_context( + # RansacRegressor.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit estimator using RANSAC algorithm. @@ -313,8 +318,6 @@ def fit(self, X, y, sample_weight=None): `is_data_valid` and `is_model_valid` return False for all `max_trials` randomly chosen sub-samples. """ - self._validate_params() - # Need to validate separately here. We can't pass multi_output=True # because that would allow y to be csr. Delay expensive finiteness # check to the estimator's own input validation. diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 28ef7cbd43eb7..893b10d1d93ae 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -25,6 +25,7 @@ from ._base import _preprocess_data, _rescale_data from ._sag import sag_solver from ..base import MultiOutputMixin, RegressorMixin, is_classifier +from ..base import _fit_context from ..utils.extmath import safe_sparse_dot from ..utils.extmath import row_norms from ..utils import check_array @@ -1114,6 +1115,7 @@ def __init__( random_state=random_state, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge regression model. @@ -1134,8 +1136,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) X, y = self._validate_data( X, @@ -1423,6 +1423,7 @@ def __init__( ) self.class_weight = class_weight + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge classifier model. @@ -1446,8 +1447,6 @@ def fit(self, X, y, sample_weight=None): self : object Instance of the estimator. """ - self._validate_params() - X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, self.solver) super().fit(X, Y, sample_weight=sample_weight) @@ -2354,6 +2353,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): 0.5166... """ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge regression model with cv. @@ -2383,8 +2383,6 @@ def fit(self, X, y, sample_weight=None): cross-validation takes the sample weights into account when computing the validation score. """ - self._validate_params() - super().fit(X, y, sample_weight=sample_weight) return self @@ -2533,6 +2531,7 @@ def __init__( ) self.class_weight = class_weight + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge classifier with cv. @@ -2555,8 +2554,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. 
""" - self._validate_params() - # `RidgeClassifier` does not accept "sag" or "saga" solver and thus support # csr, csc, and coo sparse matrices. By using solver="eigen" we force to accept # all sparse format. diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 2f27bdee7968b..bc8f31016c6f8 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -13,6 +13,7 @@ from numbers import Integral, Real from ..base import clone, is_classifier +from ..base import _fit_context from ._base import LinearClassifierMixin, SparseCoefMixin from ._base import make_dataset from ..base import BaseEstimator, RegressorMixin, OutlierMixin @@ -805,6 +806,7 @@ def _fit_multiclass(self, X, y, alpha, C, learning_rate, sample_weight, max_iter self._standard_intercept = np.atleast_1d(self.intercept_) self.intercept_ = self._standard_intercept + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None, sample_weight=None): """Perform one epoch of stochastic gradient descent on given samples. @@ -839,7 +841,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): Returns an instance of self. """ if not hasattr(self, "classes_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) if self.class_weight == "balanced": @@ -869,6 +870,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): intercept_init=None, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): """Fit linear model with Stochastic Gradient Descent. @@ -897,7 +899,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): self : object Returns an instance of self. """ - self._validate_params() self._more_validate_params() return self._fit( @@ -1470,6 +1471,7 @@ def _partial_fit( return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, sample_weight=None): """Perform one epoch of stochastic gradient descent on given samples. @@ -1496,7 +1498,6 @@ def partial_fit(self, X, y, sample_weight=None): Returns an instance of self. """ if not hasattr(self, "coef_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) return self._partial_fit( @@ -1565,6 +1566,7 @@ def _fit( return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): """Fit linear model with Stochastic Gradient Descent. @@ -1590,7 +1592,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): self : object Fitted `SGDRegressor` estimator. """ - self._validate_params() self._more_validate_params() return self._fit( @@ -2366,6 +2367,7 @@ def _partial_fit( return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, sample_weight=None): """Fit linear One-Class SVM with Stochastic Gradient Descent. @@ -2386,7 +2388,6 @@ def partial_fit(self, X, y=None, sample_weight=None): Returns a fitted instance of self. """ if not hasattr(self, "coef_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) alpha = self.nu / 2 @@ -2453,6 +2454,7 @@ def _fit( return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None): """Fit linear One-Class SVM with Stochastic Gradient Descent. 
@@ -2485,7 +2487,6 @@ def fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None): self : object Returns a fitted instance of self. """ - self._validate_params() self._more_validate_params() alpha = self.nu / 2 diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 67d6ca532a8ab..72c2d897681c4 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -19,6 +19,7 @@ from ._base import LinearModel from ..base import RegressorMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._param_validation import Interval from ..utils.parallel import delayed, Parallel @@ -395,6 +396,7 @@ def _check_subparams(self, n_samples, n_features): return n_subsamples, n_subpopulation + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit linear model. @@ -410,7 +412,6 @@ def fit(self, X, y): self : returns an instance of self. Fitted `TheilSenRegressor` estimator. """ - self._validate_params() random_state = check_random_state(self.random_state) X, y = self._validate_data(X, y, y_numeric=True) n_samples, n_features = X.shape diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 92206721aac15..0917ef7d207bc 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -12,6 +12,7 @@ from scipy.sparse.csgraph import connected_components from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..neighbors import NearestNeighbors, kneighbors_graph from ..neighbors import radius_neighbors_graph from ..utils.validation import check_is_fitted @@ -235,7 +236,7 @@ def _fit_transform(self, X): tol=self.tol, max_iter=self.max_iter, n_jobs=self.n_jobs, - ) + ).set_output(transform="default") if self.n_neighbors is not None: nbg = kneighbors_graph( @@ -332,6 +333,10 @@ def reconstruction_error(self): evals = self.kernel_pca_.eigenvalues_ return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0] + @_fit_context( + # Isomap.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Compute the embedding vectors for data X. @@ -350,10 +355,13 @@ def fit(self, X, y=None): self : object Returns a fitted instance of self. """ - self._validate_params() self._fit_transform(X) return self + @_fit_context( + # Isomap.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit the model from data in X and transform X. @@ -371,7 +379,6 @@ def fit_transform(self, X, y=None): X_new : array-like, shape (n_samples, n_components) X transformed in the new space. """ - self._validate_params() self._fit_transform(X) return self.embedding_ diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 10a22b12dfd1d..6f57b0627b8be 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -17,6 +17,7 @@ TransformerMixin, _UnstableArchMixin, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..utils import check_random_state, check_array from ..utils._arpack import _init_arpack_v0 @@ -759,6 +760,7 @@ def _fit_transform(self, X): ) self._n_features_out = self.embedding_.shape[1] + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Compute the embedding vectors for data X. @@ -775,10 +777,10 @@ def fit(self, X, y=None): self : object Fitted `LocallyLinearEmbedding` class instance. 
""" - self._validate_params() self._fit_transform(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Compute the embedding vectors for data X and transform X. @@ -795,7 +797,6 @@ def fit_transform(self, X, y=None): X_new : array-like, shape (n_samples, n_components) Returns the instance itself. """ - self._validate_params() self._fit_transform(X) return self.embedding_ diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 7fc46325a1ae1..6b7a818b94ea8 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -13,6 +13,7 @@ import warnings from ..base import BaseEstimator +from ..base import _fit_context from ..metrics import euclidean_distances from ..utils import check_random_state, check_array, check_symmetric from ..isotonic import IsotonicRegression @@ -569,10 +570,10 @@ def fit(self, X, y=None, init=None): self : object Fitted estimator. """ - # parameter will be validated in `fit_transform` call self.fit_transform(X, init=init) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None, init=None): """ Fit the data from `X`, and returns the embedded coordinates. @@ -597,7 +598,6 @@ def fit_transform(self, X, y=None, init=None): X_new : ndarray of shape (n_samples, n_components) X transformed in the new space. """ - self._validate_params() X = self._validate_data(X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": warnings.warn( diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 8291d8326eb05..af965a1362b8f 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -17,6 +17,7 @@ from scipy.sparse.csgraph import laplacian as csgraph_laplacian from ..base import BaseEstimator +from ..base import _fit_context from ..utils import ( check_array, check_random_state, @@ -652,6 +653,7 @@ def _get_affinity_matrix(self, X, Y=None): self.affinity_matrix_ = self.affinity(X) return self.affinity_matrix_ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model from data in X. @@ -674,8 +676,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csr", ensure_min_samples=2) random_state = check_random_state(self.random_state) diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 6ef6ce999cb08..c372ddcca3c2e 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -17,6 +17,7 @@ from numbers import Integral, Real from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClassNamePrefixFeaturesOutMixin, TransformerMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import check_non_negative @@ -1078,6 +1079,10 @@ def _tsne( return X_embedded + @_fit_context( + # TSNE.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit X into an embedded space and return that transformed output. @@ -1099,12 +1104,15 @@ def fit_transform(self, X, y=None): X_new : ndarray of shape (n_samples, n_components) Embedding of the training data in low-dimensional space. 
""" - self._validate_params() self._check_params_vs_input(X) embedding = self._fit(X) self.embedding_ = embedding return self.embedding_ + @_fit_context( + # TSNE.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit X into an embedded space. @@ -1126,7 +1134,6 @@ def fit(self, X, y=None): X_new : array of shape (n_samples, n_components) Embedding of the training data in low-dimensional space. """ - self._validate_params() self.fit_transform(X) return self diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 67b04e9382acb..dbe5b76f0f4c9 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -919,8 +919,9 @@ def haversine_distances(X, Y=None): in radians. The dimension of the data must be 2. .. math:: - D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2) - + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}] + D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x_{lat} - y_{lat}) / 2) + + \\cos(x_{lat})\\cos(y_{lat})\\ + sin^2((x_{lon} - y_{lon}) / 2)}] Parameters ---------- @@ -1220,6 +1221,13 @@ def paired_cosine_distances(X, Y): } +@validate_params( + { + "X": ["array-like"], + "Y": ["array-like"], + "metric": [StrOptions(set(PAIRED_DISTANCES)), callable], + } +) def paired_distances(X, Y, *, metric="euclidean", **kwds): """ Compute the paired distances between X and Y. @@ -1278,8 +1286,6 @@ def paired_distances(X, Y, *, metric="euclidean", **kwds): for i in range(len(X)): distances[i] = metric(X[i], Y[i]) return distances - else: - raise ValueError("Unknown distance %s" % metric) # Kernels diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index a298dfec6a0da..fbca4f1d49dcd 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -16,6 +16,7 @@ from ..cluster import kmeans_plusplus from ..base import BaseEstimator from ..base import DensityMixin +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils import check_random_state from ..utils.validation import check_is_fitted @@ -182,6 +183,7 @@ def fit(self, X, y=None): self.fit_predict(X, y) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_predict(self, X, y=None): """Estimate model parameters using X and predict the labels for X. @@ -209,8 +211,6 @@ def fit_predict(self, X, y=None): labels : array, shape (n_samples,) Component labels. """ - self._validate_params() - X = self._validate_data(X, dtype=[np.float64, np.float32], ensure_min_samples=2) if X.shape[0] < self.n_components: raise ValueError( diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 76dc02e625408..4a3f5d1e239a8 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -33,6 +33,7 @@ from ._search import ParameterSampler from ._plot import LearningCurveDisplay +from ._plot import ValidationCurveDisplay if typing.TYPE_CHECKING: # Avoid errors in type checkers (e.g. mypy) for experimental estimators. @@ -74,6 +75,7 @@ "permutation_test_score", "train_test_split", "validation_curve", + "ValidationCurveDisplay", ] diff --git a/sklearn/model_selection/_plot.py b/sklearn/model_selection/_plot.py index 6a6133a722251..bc5a600e57234 100644 --- a/sklearn/model_selection/_plot.py +++ b/sklearn/model_selection/_plot.py @@ -1,10 +1,140 @@ +import warnings + import numpy as np -from . import learning_curve +from . 
import learning_curve, validation_curve from ..utils import check_matplotlib_support +from ..utils._plotting import _validate_score_name, _interval_max_min_ratio + + +class _BaseCurveDisplay: + def _plot_curve( + self, + x_data, + *, + ax=None, + negate_score=False, + score_name=None, + score_type="test", + log_scale="deprecated", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + check_matplotlib_support(f"{self.__class__.__name__}.plot") + + import matplotlib.pyplot as plt + + if ax is None: + _, ax = plt.subplots() + + if negate_score: + train_scores, test_scores = -self.train_scores, -self.test_scores + else: + train_scores, test_scores = self.train_scores, self.test_scores + + if std_display_style not in ("errorbar", "fill_between", None): + raise ValueError( + f"Unknown std_display_style: {std_display_style}. Should be one of" + " 'errorbar', 'fill_between', or None." + ) + + if score_type not in ("test", "train", "both"): + raise ValueError( + f"Unknown score_type: {score_type}. Should be one of 'test', " + "'train', or 'both'." + ) + + if score_type == "train": + scores = {"Train": train_scores} + elif score_type == "test": + scores = {"Test": test_scores} + else: # score_type == "both" + scores = {"Train": train_scores, "Test": test_scores} + + if std_display_style in ("fill_between", None): + # plot the mean score + if line_kw is None: + line_kw = {} + + self.lines_ = [] + for line_label, score in scores.items(): + self.lines_.append( + *ax.plot( + x_data, + score.mean(axis=1), + label=line_label, + **line_kw, + ) + ) + self.errorbar_ = None + self.fill_between_ = None # overwritten below by fill_between + + if std_display_style == "errorbar": + if errorbar_kw is None: + errorbar_kw = {} + + self.errorbar_ = [] + for line_label, score in scores.items(): + self.errorbar_.append( + ax.errorbar( + x_data, + score.mean(axis=1), + score.std(axis=1), + label=line_label, + **errorbar_kw, + ) + ) + self.lines_, self.fill_between_ = None, None + elif std_display_style == "fill_between": + if fill_between_kw is None: + fill_between_kw = {} + default_fill_between_kw = {"alpha": 0.5} + fill_between_kw = {**default_fill_between_kw, **fill_between_kw} + + self.fill_between_ = [] + for line_label, score in scores.items(): + self.fill_between_.append( + ax.fill_between( + x_data, + score.mean(axis=1) - score.std(axis=1), + score.mean(axis=1) + score.std(axis=1), + **fill_between_kw, + ) + ) + + score_name = self.score_name if score_name is None else score_name + + ax.legend() + # TODO(1.5): to be removed + if log_scale != "deprecated": + warnings.warn( + ( + "The `log_scale` parameter is deprecated as of version 1.3 " + "and will be removed in 1.5. You can use display.ax_.set_xscale " + "and display.ax_.set_yscale instead." + ), + FutureWarning, + ) + xscale = "log" if log_scale else "linear" + else: + # We found that a ratio, smaller or bigger than 5, between the largest and + # smallest gap of the x values is a good indicator to choose between linear + # and log scale. + if _interval_max_min_ratio(x_data) > 5: + xscale = "symlog" if x_data.min() <= 0 else "log" + else: + xscale = "linear" + ax.set_xscale(xscale) + ax.set_ylabel(f"{score_name}") -class LearningCurveDisplay: + self.ax_ = ax + self.figure_ = ax.figure + + +class LearningCurveDisplay(_BaseCurveDisplay): """Learning Curve visualization. It is recommended to use @@ -12,7 +142,10 @@ class LearningCurveDisplay: create a :class:`~sklearn.model_selection.LearningCurveDisplay` instance. 
All parameters are stored as attributes. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide ` for general information + about the visualization API and + :ref:`detailed documentation ` regarding the learning + curve visualization. .. versionadded:: 1.2 @@ -29,9 +162,12 @@ class LearningCurveDisplay: Scores on test set. score_name : str, default=None - The name of the score used in `learning_curve`. It will be used to - decorate the y-axis. If `None`, the generic name `"Score"` will be - used. + The name of the score used in `learning_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. Attributes ---------- @@ -89,8 +225,8 @@ def plot( *, negate_score=False, score_name=None, - score_type="test", - log_scale=False, + score_type="both", + log_scale="deprecated", std_display_style="fill_between", line_kw=None, fill_between_kw=None, @@ -111,16 +247,25 @@ def plot( `scikit-learn`. score_name : str, default=None - The name of the score used to decorate the y-axis of the plot. If - `None`, the generic name "Score" will be used. - - score_type : {"test", "train", "both"}, default="test" + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" The type of score to plot. Can be one of `"test"`, `"train"`, or `"both"`. - log_scale : bool, default=False + log_scale : bool, default="deprecated" Whether or not to use a logarithmic scale for the x-axis. + .. deprecated:: 1.3 + `log_scale` is deprecated in 1.3 and will be removed in 1.5. + Use `display.ax_.set_xscale` and `display.ax_.set_yscale` instead. + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" The style used to display the score standard deviation around the mean score. If None, no standard deviation representation is @@ -143,98 +288,19 @@ def plot( display : :class:`~sklearn.model_selection.LearningCurveDisplay` Object that stores computed values. """ - check_matplotlib_support(f"{self.__class__.__name__}.plot") - - import matplotlib.pyplot as plt - - if ax is None: - _, ax = plt.subplots() - - if negate_score: - train_scores, test_scores = -self.train_scores, -self.test_scores - else: - train_scores, test_scores = self.train_scores, self.test_scores - - if std_display_style not in ("errorbar", "fill_between", None): - raise ValueError( - f"Unknown std_display_style: {std_display_style}. Should be one of" - " 'errorbar', 'fill_between', or None." - ) - - if score_type not in ("test", "train", "both"): - raise ValueError( - f"Unknown score_type: {score_type}. Should be one of 'test', " - "'train', or 'both'." 
- ) - - if score_type == "train": - scores = {"Training metric": train_scores} - elif score_type == "test": - scores = {"Testing metric": test_scores} - else: # score_type == "both" - scores = {"Training metric": train_scores, "Testing metric": test_scores} - - if std_display_style in ("fill_between", None): - # plot the mean score - if line_kw is None: - line_kw = {} - - self.lines_ = [] - for line_label, score in scores.items(): - self.lines_.append( - *ax.plot( - self.train_sizes, - score.mean(axis=1), - label=line_label, - **line_kw, - ) - ) - self.errorbar_ = None - self.fill_between_ = None # overwritten below by fill_between - - if std_display_style == "errorbar": - if errorbar_kw is None: - errorbar_kw = {} - - self.errorbar_ = [] - for line_label, score in scores.items(): - self.errorbar_.append( - ax.errorbar( - self.train_sizes, - score.mean(axis=1), - score.std(axis=1), - label=line_label, - **errorbar_kw, - ) - ) - self.lines_, self.fill_between_ = None, None - elif std_display_style == "fill_between": - if fill_between_kw is None: - fill_between_kw = {} - default_fill_between_kw = {"alpha": 0.5} - fill_between_kw = {**default_fill_between_kw, **fill_between_kw} - - self.fill_between_ = [] - for line_label, score in scores.items(): - self.fill_between_.append( - ax.fill_between( - self.train_sizes, - score.mean(axis=1) - score.std(axis=1), - score.mean(axis=1) + score.std(axis=1), - **fill_between_kw, - ) - ) - - score_name = self.score_name if score_name is None else score_name - - ax.legend() - if log_scale: - ax.set_xscale("log") - ax.set_xlabel("Number of samples in the training set") - ax.set_ylabel(f"{score_name}") - - self.ax_ = ax - self.figure_ = ax.figure + self._plot_curve( + self.train_sizes, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + log_scale=log_scale, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel("Number of samples in the training set") return self @classmethod @@ -259,8 +325,8 @@ def from_estimator( ax=None, negate_score=False, score_name=None, - score_type="test", - log_scale=False, + score_type="both", + log_scale="deprecated", std_display_style="fill_between", line_kw=None, fill_between_kw=None, @@ -268,6 +334,11 @@ def from_estimator( ): """Create a learning curve display from an estimator. + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the learning curve + visualization. + Parameters ---------- estimator : object type that implements the "fit" and "predict" methods @@ -368,16 +439,25 @@ def from_estimator( `scikit-learn`. score_name : str, default=None - The name of the score used to decorate the y-axis of the plot. - If `None`, the generic `"Score"` name will be used. - - score_type : {"test", "train", "both"}, default="test" + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" The type of score to plot. Can be one of `"test"`, `"train"`, or `"both"`. 
- log_scale : bool, default=False + log_scale : bool, default="deprecated" Whether or not to use a logarithmic scale for the x-axis. + .. deprecated:: 1.3 + `log_scale` is deprecated in 1.3 and will be removed in 1.5. + Use `display.ax_.xscale` and `display.ax_.yscale` instead. + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" The style used to display the score standard deviation around the mean score. If `None`, no representation of the standard deviation @@ -414,7 +494,7 @@ def from_estimator( """ check_matplotlib_support(f"{cls.__name__}.from_estimator") - score_name = "Score" if score_name is None else score_name + score_name = _validate_score_name(score_name, scoring, negate_score) train_sizes, train_scores, test_scores = learning_curve( estimator, @@ -451,3 +531,377 @@ def from_estimator( fill_between_kw=fill_between_kw, errorbar_kw=errorbar_kw, ) + + +class ValidationCurveDisplay(_BaseCurveDisplay): + """Validation Curve visualization. + + It is recommended to use + :meth:`~sklearn.model_selection.ValidationCurveDisplay.from_estimator` to + create a :class:`~sklearn.model_selection.ValidationCurveDisplay` instance. + All parameters are stored as attributes. + + Read more in the :ref:`User Guide ` for general information + about the visualization API and :ref:`detailed documentation + ` regarding the validation curve visualization. + + .. versionadded:: 1.3 + + Parameters + ---------- + param_name : str + Name of the parameter that has been varied. + + param_range : ndarray of shape (n_ticks,) + The values of the parameter that have been evaluated. + + train_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on test set. + + score_name : str, default=None + The name of the score used in `validation_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. + + Attributes + ---------- + ax_ : matplotlib Axes + Axes with the validation curve. + + figure_ : matplotlib Figure + Figure containing the validation curve. + + errorbar_ : list of matplotlib Artist or None + When the `std_display_style` is `"errorbar"`, this is a list of + `matplotlib.container.ErrorbarContainer` objects. If another style is + used, `errorbar_` is `None`. + + lines_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.lines.Line2D` objects corresponding to the mean train and + test scores. If another style is used, `line_` is `None`. + + fill_between_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.collections.PolyCollection` objects. If another style is + used, `fill_between_` is `None`. + + See Also + -------- + sklearn.model_selection.validation_curve : Compute the validation curve. 
+ + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay, validation_curve + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> train_scores, test_scores = validation_curve( + ... logistic_regression, X, y, param_name=param_name, param_range=param_range + ... ) + >>> display = ValidationCurveDisplay( + ... param_name=param_name, param_range=param_range, + ... train_scores=train_scores, test_scores=test_scores, score_name="Score" + ... ) + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__( + self, *, param_name, param_range, train_scores, test_scores, score_name=None + ): + self.param_name = param_name + self.param_range = param_range + self.train_scores = train_scores + self.test_scores = test_scores + self.score_name = score_name + + def plot( + self, + ax=None, + *, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If None, no standard deviation representation is + displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. 
+ """ + self._plot_curve( + self.param_range, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + log_scale="deprecated", + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel(f"{self.param_name}") + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + param_name, + param_range, + groups=None, + cv=None, + scoring=None, + n_jobs=None, + pre_dispatch="all", + verbose=0, + error_score=np.nan, + fit_params=None, + ax=None, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Create a validation curve display from an estimator. + + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the validation curve + visualization. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : str + Name of the parameter that will be varied. + + param_range : array-like of shape (n_values,) + The values of the parameter that will be evaluated. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and `y` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all + other cases, :class:`~sklearn.model_selectionKFold` is used. These + splitters are instantiated with `shuffle=False` so the splits will + be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + scoring : str or callable, default=None + A string (see :ref:`scoring_parameter`) or + a scorer callable object / function with signature + `scorer(estimator, X, y)` (see :ref:`scoring`). + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + computing the score are parallelized over the different training + and test sets. `None` means 1 unless in a + :obj:`joblib.parallel_backend` context. `-1` means using all + processors. See :term:`Glossary ` for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. 
+ + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator + fitting. If set to 'raise', the error is raised. If a numeric value + is given, FitFailedWarning is raised. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If `None`, no representation of the standard deviation + is displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> ValidationCurveDisplay.from_estimator( + ... logistic_regression, X, y, param_name=param_name, + ... param_range=param_range, + ... 
) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + score_name = _validate_score_name(score_name, scoring, negate_score) + + train_scores, test_scores = validation_curve( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + groups=groups, + cv=cv, + scoring=scoring, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + verbose=verbose, + error_score=error_score, + fit_params=fit_params, + ) + + viz = cls( + param_name=param_name, + param_range=param_range, + train_scores=train_scores, + test_scores=test_scores, + score_name=score_name, + ) + return viz.plot( + ax=ax, + negate_score=negate_score, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 1621dd324f81c..695614f4e1fa0 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -26,6 +26,7 @@ from ..base import BaseEstimator, is_classifier, clone from ..base import MetaEstimatorMixin +from ..base import _fit_context from ._split import check_cv from ._validation import _fit_and_score from ._validation import _aggregate_score_dicts @@ -753,6 +754,10 @@ def _select_best_index(refit, refit_metric, results): best_index = results[f"rank_test_{refit_metric}"].argmin() return best_index + @_fit_context( + # *SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, *, groups=None, **fit_params): """Run fit with all sets of parameters. @@ -786,7 +791,6 @@ def fit(self, X, y=None, *, groups=None, **fit_params): self : object Instance of fitted estimator. """ - self._validate_params() estimator = self.estimator refit_metric = "score" diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 4826e7931d4d6..a061d7283b46d 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -7,6 +7,7 @@ from ._search import BaseSearchCV from . import ParameterGrid, ParameterSampler from ..base import is_classifier +from ..base import _fit_context from ._split import check_cv, _yields_constant_splits from ..metrics._scorer import get_scorer_names from ..utils import resample @@ -211,6 +212,10 @@ def _select_best_index(refit, refit_metric, results): return last_iter_indices[best_idx] + @_fit_context( + # Halving*SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, groups=None, **fit_params): """Run fit with all sets of parameters. @@ -238,7 +243,6 @@ def fit(self, X, y=None, groups=None, **fit_params): self : object Instance of fitted estimator. 
""" - self._validate_params() self._checked_cv_orig = check_cv( self.cv, y, classifier=is_classifier(self.estimator) ) diff --git a/sklearn/model_selection/tests/test_plot.py b/sklearn/model_selection/tests/test_plot.py index 762af8fe08336..6baa211d2dc6e 100644 --- a/sklearn/model_selection/tests/test_plot.py +++ b/sklearn/model_selection/tests/test_plot.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from sklearn.datasets import load_iris @@ -5,8 +6,8 @@ from sklearn.utils import shuffle from sklearn.utils._testing import assert_allclose, assert_array_equal -from sklearn.model_selection import learning_curve -from sklearn.model_selection import LearningCurveDisplay +from sklearn.model_selection import learning_curve, validation_curve +from sklearn.model_selection import LearningCurveDisplay, ValidationCurveDisplay @pytest.fixture @@ -21,18 +22,22 @@ def data(): ({"score_type": "invalid"}, ValueError, "Unknown score_type:"), ], ) -def test_learning_curve_display_parameters_validation( - pyplot, data, params, err_type, err_msg +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_parameters_validation( + pyplot, data, params, err_type, err_msg, CurveDisplay, specific_params ): """Check that we raise a proper error when passing invalid parameters.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] with pytest.raises(err_type, match=err_msg): - LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, **params - ) + CurveDisplay.from_estimator(estimator, X, y, **specific_params, **params) def test_learning_curve_display_default_usage(pyplot, data): @@ -63,7 +68,7 @@ def test_learning_curve_display_default_usage(pyplot, data): assert display.ax_.get_ylabel() == "Score" _, legend_labels = display.ax_.get_legend_handles_labels() - assert legend_labels == ["Testing metric"] + assert legend_labels == ["Train", "Test"] train_sizes_abs, train_scores, test_scores = learning_curve( estimator, X, y, train_sizes=train_sizes @@ -74,21 +79,63 @@ def test_learning_curve_display_default_usage(pyplot, data): assert_allclose(display.test_scores, test_scores) -def test_learning_curve_display_negate_score(pyplot, data): +def test_validation_curve_display_default_usage(pyplot, data): + """Check the default usage of the ValidationCurveDisplay class.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name, param_range = "max_depth", [1, 3, 5] + display = ValidationCurveDisplay.from_estimator( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + import matplotlib as mpl + + assert display.errorbar_ is None + + assert isinstance(display.lines_, list) + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + + assert isinstance(display.fill_between_, list) + for fill in display.fill_between_: + assert isinstance(fill, mpl.collections.PolyCollection) + assert fill.get_alpha() == 0.5 + + assert display.score_name == "Score" + assert display.ax_.get_xlabel() == f"{param_name}" + assert display.ax_.get_ylabel() == "Score" + + _, legend_labels = display.ax_.get_legend_handles_labels() + assert legend_labels == ["Train", "Test"] + + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + assert display.param_range == param_range + 
assert_array_equal(display.param_range, param_range) + assert_allclose(display.train_scores, train_scores) + assert_allclose(display.test_scores, test_scores) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_negate_score(pyplot, data, CurveDisplay, specific_params): """Check the behaviour of the `negate_score` parameter calling `from_estimator` and `plot`. """ X, y = data estimator = DecisionTreeClassifier(max_depth=1, random_state=0) - train_sizes = [0.3, 0.6, 0.9] negate_score = False - display = LearningCurveDisplay.from_estimator( - estimator, - X, - y, - train_sizes=train_sizes, - negate_score=negate_score, + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score ) positive_scores = display.lines_[0].get_data()[1] @@ -96,22 +143,18 @@ def test_learning_curve_display_negate_score(pyplot, data): assert display.ax_.get_ylabel() == "Score" negate_score = True - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, negate_score=negate_score + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score ) negative_scores = display.lines_[0].get_data()[1] assert (negative_scores <= 0).all() assert_allclose(negative_scores, -positive_scores) - assert display.ax_.get_ylabel() == "Score" + assert display.ax_.get_ylabel() == "Negative score" negate_score = False - display = LearningCurveDisplay.from_estimator( - estimator, - X, - y, - train_sizes=train_sizes, - negate_score=negate_score, + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score ) assert display.ax_.get_ylabel() == "Score" display.plot(negate_score=not negate_score) @@ -122,23 +165,30 @@ def test_learning_curve_display_negate_score(pyplot, data): @pytest.mark.parametrize( "score_name, ylabel", [(None, "Score"), ("Accuracy", "Accuracy")] ) -def test_learning_curve_display_score_name(pyplot, data, score_name, ylabel): +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_score_name( + pyplot, data, score_name, ylabel, CurveDisplay, specific_params +): """Check that we can overwrite the default score name shown on the y-axis.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, score_name=score_name + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name ) assert display.ax_.get_ylabel() == ylabel X, y = data estimator = DecisionTreeClassifier(max_depth=1, random_state=0) - train_sizes = [0.3, 0.6, 0.9] - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, score_name=score_name + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name ) assert display.score_name == ylabel @@ -166,7 +216,7 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): ) _, legend_label = display.ax_.get_legend_handles_labels() - assert legend_label == ["Training metric"] + assert legend_label == ["Train"] if std_display_style 
is None: assert len(display.lines_) == 1 @@ -191,7 +241,7 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): ) _, legend_label = display.ax_.get_legend_handles_labels() - assert legend_label == ["Testing metric"] + assert legend_label == ["Test"] if std_display_style is None: assert len(display.lines_) == 1 @@ -216,7 +266,7 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): ) _, legend_label = display.ax_.get_legend_handles_labels() - assert legend_label == ["Training metric", "Testing metric"] + assert legend_label == ["Train", "Test"] if std_display_style is None: assert len(display.lines_) == 2 @@ -235,100 +285,220 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): assert_allclose(y_data_test, test_scores.mean(axis=1)) -def test_learning_curve_display_log_scale(pyplot, data): - """Check the behaviour of the parameter `log_scale`.""" +@pytest.mark.parametrize("std_display_style", (None, "errorbar")) +def test_validation_curve_display_score_type(pyplot, data, std_display_style): + """Check the behaviour of setting the `score_type` parameter.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, log_scale=True + param_name, param_range = "max_depth", [1, 3, 5] + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range ) - assert display.ax_.get_xscale() == "log" - assert display.ax_.get_yscale() == "linear" + score_type = "train" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, log_scale=False + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + assert_allclose(y_data, train_scores.mean(axis=1)) + + score_type = "test" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, ) - assert display.ax_.get_xscale() == "linear" - assert display.ax_.get_yscale() == "linear" + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Test"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + assert_allclose(y_data, test_scores.mean(axis=1)) + + score_type = "both" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train", "Test"] + + if std_display_style is 
None: + assert len(display.lines_) == 2 + assert display.errorbar_ is None + x_data_train, y_data_train = display.lines_[0].get_data() + x_data_test, y_data_test = display.lines_[1].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 2 + x_data_train, y_data_train = display.errorbar_[0].lines[0].get_data() + x_data_test, y_data_test = display.errorbar_[1].lines[0].get_data() + + assert_array_equal(x_data_train, param_range) + assert_allclose(y_data_train, train_scores.mean(axis=1)) + assert_array_equal(x_data_test, param_range) + assert_allclose(y_data_test, test_scores.mean(axis=1)) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params, expected_xscale", + [ + ( + ValidationCurveDisplay, + {"param_name": "max_depth", "param_range": np.arange(1, 5)}, + "linear", + ), + (LearningCurveDisplay, {"train_sizes": np.linspace(0.1, 0.9, num=5)}, "linear"), + ( + ValidationCurveDisplay, + { + "param_name": "max_depth", + "param_range": np.round(np.logspace(0, 2, num=5)).astype(np.int64), + }, + "log", + ), + (LearningCurveDisplay, {"train_sizes": np.logspace(-1, 0, num=5)}, "log"), + ], +) +def test_curve_display_xscale_auto( + pyplot, data, CurveDisplay, specific_params, expected_xscale +): + """Check the behaviour of the x-axis scaling depending on the data provided.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + display = CurveDisplay.from_estimator(estimator, X, y, **specific_params) + assert display.ax_.get_xscale() == expected_xscale -def test_learning_curve_display_std_display_style(pyplot, data): + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_std_display_style(pyplot, data, CurveDisplay, specific_params): """Check the behaviour of the parameter `std_display_style`.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) import matplotlib as mpl - train_sizes = [0.3, 0.6, 0.9] std_display_style = None - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, ) - assert len(display.lines_) == 1 - assert isinstance(display.lines_[0], mpl.lines.Line2D) + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) assert display.errorbar_ is None assert display.fill_between_ is None _, legend_label = display.ax_.get_legend_handles_labels() - assert len(legend_label) == 1 + assert len(legend_label) == 2 std_display_style = "fill_between" - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, ) - assert len(display.lines_) == 1 - assert isinstance(display.lines_[0], mpl.lines.Line2D) + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) assert display.errorbar_ is None - assert len(display.fill_between_) == 1 - assert isinstance(display.fill_between_[0], mpl.collections.PolyCollection) + assert len(display.fill_between_) == 2 + for fill_between in display.fill_between_: + assert isinstance(fill_between, mpl.collections.PolyCollection) _, legend_label = display.ax_.get_legend_handles_labels() - assert len(legend_label) == 1 + assert len(legend_label) == 2 
std_display_style = "errorbar" - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, ) assert display.lines_ is None - assert len(display.errorbar_) == 1 - assert isinstance(display.errorbar_[0], mpl.container.ErrorbarContainer) + assert len(display.errorbar_) == 2 + for errorbar in display.errorbar_: + assert isinstance(errorbar, mpl.container.ErrorbarContainer) assert display.fill_between_ is None _, legend_label = display.ax_.get_legend_handles_labels() - assert len(legend_label) == 1 + assert len(legend_label) == 2 -def test_learning_curve_display_plot_kwargs(pyplot, data): +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_plot_kwargs(pyplot, data, CurveDisplay, specific_params): """Check the behaviour of the different plotting keyword arguments: `line_kw`, `fill_between_kw`, and `errorbar_kw`.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] std_display_style = "fill_between" line_kw = {"color": "red"} fill_between_kw = {"color": "red", "alpha": 1.0} - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, line_kw=line_kw, fill_between_kw=fill_between_kw, @@ -342,13 +512,36 @@ def test_learning_curve_display_plot_kwargs(pyplot, data): std_display_style = "errorbar" errorbar_kw = {"color": "red"} - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, errorbar_kw=errorbar_kw, ) assert display.errorbar_[0].lines[0].get_color() == "red" + + +# TODO(1.5): to be removed +def test_learning_curve_display_deprecate_log_scale(data, pyplot): + """Check that we warn for the deprecated parameter `log_scale`.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + with pytest.warns(FutureWarning, match="`log_scale` parameter is deprecated"): + display = LearningCurveDisplay.from_estimator( + estimator, X, y, train_sizes=[0.3, 0.6, 0.9], log_scale=True + ) + + assert display.ax_.get_xscale() == "log" + assert display.ax_.get_yscale() == "linear" + + with pytest.warns(FutureWarning, match="`log_scale` parameter is deprecated"): + display = LearningCurveDisplay.from_estimator( + estimator, X, y, train_sizes=[0.3, 0.6, 0.9], log_scale=False + ) + + assert display.ax_.get_xscale() == "linear" + assert display.ax_.get_yscale() == "linear" diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 74684e608d3c1..4c30bcdb6cac3 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -43,6 +43,7 @@ from .base import BaseEstimator, ClassifierMixin, clone, is_classifier from .base import MultiOutputMixin from .base import MetaEstimatorMixin, is_regressor +from .base import _fit_context from .preprocessing import LabelBinarizer from .metrics.pairwise import pairwise_distances_argmin from .utils import check_random_state @@ -296,6 +297,10 @@ def __init__(self, estimator, *, n_jobs=None, verbose=0): self.n_jobs = n_jobs self.verbose = verbose + @_fit_context( + # OneVsRestClassifier.estimator is not validated yet + 
prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit underlying estimators. @@ -313,8 +318,6 @@ def fit(self, X, y): self : object Instance of fitted estimator. """ - self._validate_params() - # A sparse LabelBinarizer, with sparse_output=True, has been shown to # outperform or match a dense label binarizer in all cases and has also # resulted in less or equal memory consumption in the fit_ovr function @@ -348,6 +351,10 @@ def fit(self, X, y): return self @available_if(_estimators_has("partial_fit")) + @_fit_context( + # OneVsRestClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y, classes=None): """Partially fit underlying estimators. @@ -376,8 +383,6 @@ def partial_fit(self, X, y, classes=None): Instance of partially fitted estimator. """ if _check_partial_fit_first_call(self, classes): - self._validate_params() - if not hasattr(self.estimator, "partial_fit"): raise ValueError( ("Base estimator {0}, doesn't have partial_fit method").format( @@ -655,6 +660,10 @@ def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs + @_fit_context( + # OneVsOneClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit underlying estimators. @@ -671,7 +680,6 @@ def fit(self, X, y): self : object The fitted underlying estimator. """ - self._validate_params() # We need to validate the data because we do a safe_indexing later. X, y = self._validate_data( X, y, accept_sparse=["csr", "csc"], force_all_finite=False @@ -706,6 +714,10 @@ def fit(self, X, y): return self @available_if(_estimators_has("partial_fit")) + @_fit_context( + # OneVsOneClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y, classes=None): """Partially fit underlying estimators. @@ -735,8 +747,6 @@ def partial_fit(self, X, y, classes=None): """ first_call = _check_partial_fit_first_call(self, classes) if first_call: - self._validate_params() - self.estimators_ = [ clone(self.estimator) for _ in range(self.n_classes_ * (self.n_classes_ - 1) // 2) @@ -968,6 +978,10 @@ def __init__(self, estimator, *, code_size=1.5, random_state=None, n_jobs=None): self.random_state = random_state self.n_jobs = n_jobs + @_fit_context( + # OutputCodeClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit underlying estimators. @@ -984,7 +998,6 @@ def fit(self, X, y): self : object Returns a fitted instance of self. """ - self._validate_params() y = self._validate_data(X="no_validation", y=y) random_state = check_random_state(self.random_state) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 90c1f04f7e46a..8bb954e976f4c 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -28,6 +28,7 @@ RegressorMixin, clone, is_classifier, + _fit_context, ) from .model_selection import cross_val_predict from .utils import _print_elapsed_time, check_random_state, Bunch @@ -104,6 +105,10 @@ def __init__(self, estimator, *, n_jobs=None): self.n_jobs = n_jobs @_available_if_estimator_has("partial_fit") + @_fit_context( + # MultiOutput*.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_params): """Incrementally fit a separate model for each class output. 
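The hunks above and below repeat the pattern used throughout this patch: the explicit `self._validate_params()` call is removed from each `fit`/`partial_fit` body and replaced by an `@_fit_context(...)` decorator imported from `sklearn.base`, with `prefer_skip_nested_validation=False` whenever a wrapped estimator or a parameter such as `metric` "is not validated yet" (per the inline comments) and `True` otherwise. The real `_fit_context` lives in `sklearn/base.py` and is not part of this diff; the snippet below is only a minimal sketch of the idea, and its name and behaviour are assumptions made for illustration.

    # Hypothetical stand-in for sklearn.base._fit_context (illustration only).
    import functools

    def fit_context_sketch(*, prefer_skip_nested_validation):
        def decorator(fit_method):
            @functools.wraps(fit_method)
            def wrapper(estimator, *args, **kwargs):
                # Validate the estimator's own constructor parameters here,
                # instead of calling self._validate_params() inside every fit.
                estimator._validate_params()
                # A full implementation would also record whether nested
                # sub-estimators may safely skip re-validating their own
                # parameters, which is what prefer_skip_nested_validation
                # controls.
                return fit_method(estimator, *args, **kwargs)
            return wrapper
        return decorator

Plain estimators whose parameters are all declared (e.g. `GaussianNB`, `BernoulliRBM`) pass `prefer_skip_nested_validation=True`, while meta-estimators such as `MultiOutputClassifier` or `ClassifierChain` keep it `False` because their inner estimator's parameters still need their own validation.
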
@@ -151,9 +156,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_para first_time = not hasattr(self, "estimators_") - if first_time: - self._validate_params() - y = self._validate_data(X="no_validation", y=y, multi_output=True) if y.ndim == 1: @@ -203,6 +205,10 @@ def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_para return self + @_fit_context( + # MultiOutput*.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None, **fit_params): """Fit the model to data, separately for each output variable. @@ -230,8 +236,6 @@ def fit(self, X, y, sample_weight=None, **fit_params): self : object Returns a fitted instance. """ - self._validate_params() - if not hasattr(self.estimator, "fit"): raise ValueError("The base estimator should implement a fit method") @@ -887,6 +891,10 @@ class labels for each estimator in the chain. [0.0321..., 0.9935..., 0.0625...]]) """ + @_fit_context( + # ClassifierChain.base_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, Y, **fit_params): """Fit the model to data matrix X and targets Y. @@ -917,8 +925,6 @@ def fit(self, X, Y, **fit_params): "See the User Guide for more information." ) - self._validate_params() - super().fit(X, Y, **fit_params) self.classes_ = [ estimator.classes_ for chain_idx, estimator in enumerate(self.estimators_) @@ -1109,6 +1115,10 @@ class RegressorChain(MetaEstimatorMixin, RegressorMixin, _BaseChain): [2., 0.]]) """ + @_fit_context( + # RegressorChain.base_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, Y, **fit_params): """Fit the model to data matrix X and targets Y. @@ -1131,8 +1141,6 @@ def fit(self, X, Y, **fit_params): self : object Returns a fitted instance. """ - self._validate_params() - super().fit(X, Y, **fit_params) return self diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 20858ac8b5577..76d7189385828 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -22,6 +22,7 @@ from scipy.special import logsumexp from .base import BaseEstimator, ClassifierMixin +from .base import _fit_context from .preprocessing import binarize from .preprocessing import LabelBinarizer from .preprocessing import label_binarize @@ -239,6 +240,7 @@ def __init__(self, *, priors=None, var_smoothing=1e-9): self.priors = priors self.var_smoothing = var_smoothing + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Gaussian Naive Bayes according to X, y. @@ -262,7 +264,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() y = self._validate_data(y=y) return self._partial_fit( X, y, np.unique(y), _refit=True, sample_weight=sample_weight @@ -346,6 +347,7 @@ def _update_mean_variance(n_past, mu, var, X, sample_weight=None): return total_mu, total_var + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None, sample_weight=None): """Incremental fit on a batch of samples. @@ -386,8 +388,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): self : object Returns the instance itself. 
""" - self._validate_params() - return self._partial_fit( X, y, classes, _refit=False, sample_weight=sample_weight ) @@ -643,6 +643,7 @@ def _check_alpha(self): return np.maximum(alpha, alpha_lower_bound) return alpha + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None, sample_weight=None): """Incremental fit on a batch of samples. @@ -682,9 +683,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): """ first_call = not hasattr(self, "classes_") - if first_call: - self._validate_params() - X, y = self._check_X_y(X, y, reset=first_call) _, n_features = X.shape @@ -728,6 +726,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): self._update_class_log_prior(class_prior=class_prior) return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Naive Bayes classifier according to X, y. @@ -748,7 +747,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() X, y = self._check_X_y(X, y) _, n_features = X.shape diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index dbc070987d5d0..e3e2049a8f8e5 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -18,6 +18,7 @@ from ._base import _get_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import ClassifierMixin +from ..base import _fit_context from ..metrics._pairwise_distances_reduction import ArgKminClassMode from ..utils._param_validation import StrOptions from sklearn.neighbors._base import _check_precomputed @@ -203,6 +204,10 @@ def __init__( ) self.weights = weights + @_fit_context( + # KNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the k-nearest neighbors classifier from the training dataset. @@ -221,8 +226,6 @@ def fit(self, X, y): self : KNeighborsClassifier The fitted k-nearest neighbors classifier. """ - self._validate_params() - return self._fit(X, y) def predict(self, X): @@ -572,6 +575,10 @@ def __init__( self.weights = weights self.outlier_label = outlier_label + @_fit_context( + # RadiusNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the radius neighbors classifier from the training dataset. @@ -590,7 +597,6 @@ def fit(self, X, y): self : RadiusNeighborsClassifier The fitted radius neighbors classifier. """ - self._validate_params() self._fit(X, y) classes_ = self.classes_ diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 418761c2d21ee..e815d12e293c9 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -8,6 +8,7 @@ from ._base import NeighborsBase from ._unsupervised import NearestNeighbors from ..base import TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils._param_validation import StrOptions from ..utils.validation import check_is_fitted @@ -372,6 +373,10 @@ def __init__( ) self.mode = mode + @_fit_context( + # KNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the k-nearest neighbors transformer from the training dataset. @@ -388,7 +393,6 @@ def fit(self, X, y=None): self : KNeighborsTransformer The fitted k-nearest neighbors transformer. 
""" - self._validate_params() self._fit(X) self._n_features_out = self.n_samples_fit_ return self @@ -600,6 +604,10 @@ def __init__( ) self.mode = mode + @_fit_context( + # RadiusNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the radius neighbors transformer from the training dataset. @@ -617,7 +625,6 @@ def fit(self, X, y=None): self : RadiusNeighborsTransformer The fitted radius neighbors transformer. """ - self._validate_params() self._fit(X) self._n_features_out = self.n_samples_fit_ return self diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index f285b03403b5f..7f7b38497d209 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -10,6 +10,7 @@ from scipy.special import gammainc from ..base import BaseEstimator +from ..base import _fit_context from ..neighbors._base import VALID_METRICS from ..utils import check_random_state from ..utils.validation import _check_sample_weight, check_is_fitted @@ -185,6 +186,10 @@ def _choose_algorithm(self, algorithm, metric): ) return algorithm + @_fit_context( + # KernelDensity.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, sample_weight=None): """Fit the Kernel Density model on the data. @@ -208,8 +213,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - algorithm = self._choose_algorithm(self.algorithm, self.metric) if isinstance(self.bandwidth, str): diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index 90b3b0aa3d8ce..40cdc9ab5fb9d 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -8,6 +8,7 @@ from ._base import NeighborsBase from ._base import KNeighborsMixin from ..base import OutlierMixin +from ..base import _fit_context from numbers import Real from ..utils._param_validation import Interval, StrOptions @@ -256,6 +257,10 @@ def fit_predict(self, X, y=None): return self.fit(X)._predict() + @_fit_context( + # LocalOutlierFactor.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the local outlier factor detector from the training dataset. @@ -273,8 +278,6 @@ def fit(self, X, y=None): self : LocalOutlierFactor The fitted local outlier factor detector. """ - self._validate_params() - self._fit(X) n_samples = self.n_samples_fit_ diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 4a83fcc7bc080..246f0adcb36ad 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -15,6 +15,7 @@ from ..utils.extmath import softmax from ..metrics import pairwise_distances from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..preprocessing import LabelEncoder from ..decomposition import PCA from ..utils.multiclass import check_classification_targets @@ -215,6 +216,7 @@ def __init__( self.verbose = verbose self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model according to the given training data. @@ -231,8 +233,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() - # Validate the inputs X and y, and converts y to numerical classes. 
X, y = self._validate_data(X, y, ensure_min_samples=2) check_classification_targets(y) diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index 7b9c2479747d3..315393bf597e4 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -13,6 +13,7 @@ from scipy import sparse as sp from ..base import BaseEstimator, ClassifierMixin +from ..base import _fit_context from ..metrics.pairwise import pairwise_distances_argmin from ..preprocessing import LabelEncoder from ..utils.validation import check_is_fitted @@ -122,6 +123,7 @@ def __init__(self, metric="euclidean", *, shrink_threshold=None): self.metric = metric self.shrink_threshold = shrink_threshold + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """ Fit the NearestCentroid model according to the given training data. @@ -140,8 +142,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() - if isinstance(self.metric, str) and self.metric not in ( "manhattan", "euclidean", diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 003b534074ecd..b2050345c9833 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -17,6 +17,7 @@ from ._base import _get_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import RegressorMixin +from ..base import _fit_context from ..utils._param_validation import StrOptions @@ -194,6 +195,10 @@ def _more_tags(self): # For cross-validation routines to split data correctly return {"pairwise": self.metric == "precomputed"} + @_fit_context( + # KNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the k-nearest neighbors regressor from the training dataset. @@ -212,8 +217,6 @@ def fit(self, X, y): self : KNeighborsRegressor The fitted k-nearest neighbors regressor. """ - self._validate_params() - return self._fit(X, y) def predict(self, X): @@ -422,6 +425,10 @@ def __init__( ) self.weights = weights + @_fit_context( + # RadiusNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the radius neighbors regressor from the training dataset. @@ -440,7 +447,6 @@ def fit(self, X, y): self : RadiusNeighborsRegressor The fitted radius neighbors regressor. """ - self._validate_params() return self._fit(X, y) def predict(self, X): diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 53e69495b9ed4..05607f0bd0c71 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -1,4 +1,5 @@ """Unsupervised nearest neighbors learner""" +from ..base import _fit_context from ._base import NeighborsBase from ._base import KNeighborsMixin from ._base import RadiusNeighborsMixin @@ -155,6 +156,10 @@ def __init__( n_jobs=n_jobs, ) + @_fit_context( + # NearestNeighbors.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the nearest neighbors estimator from the training dataset. @@ -172,5 +177,4 @@ def fit(self, X, y=None): self : NearestNeighbors The fitted nearest neighbors estimator. 
""" - self._validate_params() return self._fit(X) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 5c4bc5a39aa2d..fb8eab2f1776d 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -21,6 +21,7 @@ RegressorMixin, ) from ..base import is_classifier +from ..base import _fit_context from ._base import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS from ._stochastic_optimizers import SGDOptimizer, AdamOptimizer from ..metrics import accuracy_score, r2_score @@ -727,6 +728,7 @@ def _update_no_improvement_count(self, early_stopping, X_val, y_val): if self.loss_curve_[-1] < self.best_loss_: self.best_loss_ = self.loss_curve_[-1] + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model to data matrix X and target(s) y. @@ -744,8 +746,6 @@ def fit(self, X, y): self : object Returns a trained MLP model. """ - self._validate_params() - return self._fit(X, y, incremental=False) def _check_solver(self): @@ -1170,6 +1170,7 @@ def _score(self, X, y): return accuracy_score(y, self._predict(X, check_input=False)) @available_if(lambda est: est._check_solver()) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None): """Update the model with a single iteration over the given data. @@ -1194,9 +1195,6 @@ def partial_fit(self, X, y, classes=None): self : object Trained MLP model. """ - if not hasattr(self, "coefs_"): - self._validate_params() - if _check_partial_fit_first_call(self, classes): self._label_binarizer = LabelBinarizer() if type_of_target(y).startswith("multilabel"): @@ -1624,6 +1622,7 @@ def _validate_input(self, X, y, incremental, reset): return X, y @available_if(lambda est: est._check_solver) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y): """Update the model with a single iteration over the given data. @@ -1640,7 +1639,4 @@ def partial_fit(self, X, y): self : object Trained MLP model. """ - if not hasattr(self, "coefs_"): - self._validate_params() - return self._fit(X, y, incremental=True) diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 0624145116180..2ded6533d8d96 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -17,6 +17,7 @@ from ..base import BaseEstimator from ..base import TransformerMixin from ..base import ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils import gen_even_slices from ..utils.extmath import safe_sparse_dot @@ -269,6 +270,7 @@ def gibbs(self, v): return v_ + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Fit the model to the partial segment of the data X. @@ -285,9 +287,6 @@ def partial_fit(self, X, y=None): self : BernoulliRBM The fitted model. """ - - self._validate_params() - first_pass = not hasattr(self, "components_") X = self._validate_data( X, accept_sparse="csr", dtype=np.float64, reset=first_pass @@ -380,6 +379,7 @@ def score_samples(self, X): fe_ = self._free_energy(v_) return v.shape[1] * log_logistic(fe_ - fe) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model to the data X. @@ -396,9 +396,6 @@ def fit(self, X, y=None): self : BernoulliRBM The fitted model. 
""" - - self._validate_params() - X = self._validate_data(X, accept_sparse="csr", dtype=(np.float64, np.float32)) n_samples = X.shape[0] rng = check_random_state(self.random_state) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 8c5dc3bd82917..43b6b7eb0c939 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -16,6 +16,7 @@ from scipy import sparse from .base import clone, TransformerMixin +from .base import _fit_context from .preprocessing import FunctionTransformer from .utils._estimator_html_repr import _VisualBlock from .utils.metaestimators import available_if @@ -385,6 +386,10 @@ def _fit(self, X, y=None, **fit_params_steps): self.steps[step_idx] = (name, fitted_transformer) return X + @_fit_context( + # estimators in Pipeline.steps are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, **fit_params): """Fit the model. @@ -411,7 +416,6 @@ def fit(self, X, y=None, **fit_params): self : object Pipeline with fitted steps. """ - self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): @@ -429,6 +433,10 @@ def _can_fit_transform(self): ) @available_if(_can_fit_transform) + @_fit_context( + # estimators in Pipeline.steps are not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None, **fit_params): """Fit the model and transform with the final estimator. @@ -456,7 +464,6 @@ def fit_transform(self, X, y=None, **fit_params): Xt : ndarray of shape (n_samples, n_transformed_features) Transformed samples. """ - self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) @@ -505,6 +512,10 @@ def predict(self, X, **predict_params): return self.steps[-1][1].predict(Xt, **predict_params) @available_if(_final_estimator_has("fit_predict")) + @_fit_context( + # estimators in Pipeline.steps are not validated yet + prefer_skip_nested_validation=False + ) def fit_predict(self, X, y=None, **fit_params): """Transform the data, and apply `fit_predict` with the final estimator. @@ -533,7 +544,6 @@ def fit_predict(self, X, y=None, **fit_params): y_pred : ndarray Result of calling `fit_predict` on the final estimator. """ - self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 013f1f57e9373..139022a9897e6 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -22,6 +22,7 @@ TransformerMixin, OneToOneFeatureMixin, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..utils import check_array from ..utils._param_validation import Interval, Options, StrOptions, validate_params @@ -435,6 +436,7 @@ def fit(self, X, y=None): self._reset() return self.partial_fit(X, y) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Online computation of min and max on X for later scaling. @@ -456,8 +458,6 @@ def partial_fit(self, X, y=None): self : object Fitted scaler. 
""" - self._validate_params() - feature_range = self.feature_range if feature_range[0] >= feature_range[1]: raise ValueError( @@ -838,6 +838,7 @@ def fit(self, X, y=None, sample_weight=None): self._reset() return self.partial_fit(X, y, sample_weight) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, sample_weight=None): """Online computation of mean and std on X for later scaling. @@ -870,8 +871,6 @@ def partial_fit(self, X, y=None, sample_weight=None): self : object Fitted scaler. """ - self._validate_params() - first_call = not hasattr(self, "n_samples_seen_") X = self._validate_data( X, @@ -1183,6 +1182,7 @@ def fit(self, X, y=None): self._reset() return self.partial_fit(X, y) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Online computation of max absolute value of X for later scaling. @@ -1204,8 +1204,6 @@ def partial_fit(self, X, y=None): self : object Fitted scaler. """ - self._validate_params() - first_pass = not hasattr(self, "n_samples_seen_") X = self._validate_data( X, @@ -1514,6 +1512,7 @@ def __init__( self.unit_variance = unit_variance self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Compute the median and quantiles to be used for scaling. @@ -1531,8 +1530,6 @@ def fit(self, X, y=None): self : object Fitted scaler. """ - self._validate_params() - # at fit, convert sparse matrices to csc for optimized computation of # the quantiles X = self._validate_data( @@ -1972,6 +1969,7 @@ def __init__(self, norm="l2", *, copy=True): self.norm = norm self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -1991,7 +1989,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() self._validate_data(X, accept_sparse="csr") return self @@ -2155,6 +2152,7 @@ def __init__(self, *, threshold=0.0, copy=True): self.threshold = threshold self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -2174,7 +2172,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() self._validate_data(X, accept_sparse="csr") return self @@ -2634,6 +2631,7 @@ def _sparse_fit(self, X, random_state): # https://github.com/numpy/numpy/issues/14685 self.quantiles_ = np.maximum.accumulate(self.quantiles_) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Compute the quantiles used for transforming. @@ -2653,8 +2651,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() - if self.n_quantiles > self.subsample: raise ValueError( "The number of quantiles cannot be greater than" @@ -3101,6 +3097,7 @@ def __init__(self, method="yeo-johnson", *, standardize=True, copy=True): self.standardize = standardize self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Estimate the optimal parameter lambda for each feature. @@ -3120,10 +3117,10 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() self._fit(X, y=y, force_transform=False) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit `PowerTransformer` to `X`, then transform `X`. @@ -3141,7 +3138,6 @@ def fit_transform(self, X, y=None): X_new : ndarray of shape (n_samples, n_features) Transformed data. 
""" - self._validate_params() return self._fit(X, y, force_transform=True) def _fit(self, X, y=None, force_transform=False): @@ -3150,24 +3146,37 @@ def _fit(self, X, y=None, force_transform=False): if not self.copy and not force_transform: # if call from fit() X = X.copy() # force copy so that fit does not change X inplace + n_samples = X.shape[0] + mean = np.mean(X, axis=0, dtype=np.float64) + var = np.var(X, axis=0, dtype=np.float64) + optim_function = { "box-cox": self._box_cox_optimize, "yeo-johnson": self._yeo_johnson_optimize, }[self.method] + + transform_function = { + "box-cox": boxcox, + "yeo-johnson": self._yeo_johnson_transform, + }[self.method] + with np.errstate(invalid="ignore"): # hide NaN warnings - self.lambdas_ = np.array([optim_function(col) for col in X.T]) + self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype) + for i, col in enumerate(X.T): + # For yeo-johnson, leave constant features unchanged + # lambda=1 corresponds to the identity transformation + is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples) + if self.method == "yeo-johnson" and is_constant_feature: + self.lambdas_[i] = 1.0 + continue + + self.lambdas_[i] = optim_function(col) - if self.standardize or force_transform: - transform_function = { - "box-cox": boxcox, - "yeo-johnson": self._yeo_johnson_transform, - }[self.method] - for i, lmbda in enumerate(self.lambdas_): - with np.errstate(invalid="ignore"): # hide NaN warnings - X[:, i] = transform_function(X[:, i], lmbda) + if self.standardize or force_transform: + X[:, i] = transform_function(X[:, i], self.lambdas_[i]) if self.standardize: - self._scaler = StandardScaler(copy=False) + self._scaler = StandardScaler(copy=False).set_output(transform="default") if force_transform: X = self._scaler.fit_transform(X) else: diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 220950586a6ef..ac7432027f462 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -11,6 +11,7 @@ from . import OneHotEncoder from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils._param_validation import Hidden, Interval, StrOptions, Options from ..utils.validation import check_array from ..utils.validation import check_is_fitted @@ -192,6 +193,7 @@ def __init__( self.subsample = subsample self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """ Fit the estimator. @@ -216,7 +218,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X, dtype="numeric") if self.dtype in (np.float64, np.float32): diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 1fc4b16a52467..de3f983d7ae6f 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -10,6 +10,7 @@ from scipy import sparse from ..base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin +from ..base import _fit_context from ..utils import check_array, is_scalar_nan, _safe_indexing from ..utils.validation import check_is_fitted from ..utils.validation import _check_feature_names_in @@ -953,6 +954,7 @@ def _compute_n_features_outs(self): return output + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Fit OneHotEncoder to X. @@ -971,8 +973,6 @@ def fit(self, X, y=None): self Fitted encoder. 
""" - self._validate_params() - if self.sparse != "deprecated": warnings.warn( ( @@ -1446,6 +1446,7 @@ def __init__( self.min_frequency = min_frequency self.max_categories = max_categories + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Fit the OrdinalEncoder to X. @@ -1464,8 +1465,6 @@ def fit(self, X, y=None): self : object Fitted encoder. """ - self._validate_params() - if self.handle_unknown == "use_encoded_value": if is_scalar_nan(self.unknown_value): if np.dtype(self.dtype).kind != "f": diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index c250c5cd0226e..d7bf1810e61c0 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -3,6 +3,7 @@ import numpy as np from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils.metaestimators import available_if from ..utils.validation import ( _allclose_dense_sparse, @@ -197,6 +198,7 @@ def _check_inverse_transform(self, X): UserWarning, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit transformer by checking X. @@ -216,7 +218,6 @@ def fit(self, X, y=None): self : object FunctionTransformer class instance. """ - self._validate_params() X = self._check_input(X, reset=True) if self.check_inverse and not (self.func is None or self.inverse_func is None): self._check_inverse_transform(X) diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index ca8607b06c2e2..f656329607ee3 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -16,7 +16,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin - +from ..base import _fit_context from ..utils.sparsefuncs import min_max_axis from ..utils._param_validation import Interval, validate_params from ..utils import column_or_1d @@ -268,6 +268,7 @@ def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): self.pos_label = pos_label self.sparse_output = sparse_output + @_fit_context(prefer_skip_nested_validation=True) def fit(self, y): """Fit label binarizer. @@ -282,9 +283,6 @@ def fit(self, y): self : object Returns the instance itself. """ - - self._validate_params() - if self.neg_label >= self.pos_label: raise ValueError( f"neg_label={self.neg_label} must be strictly less than " @@ -761,6 +759,7 @@ def __init__(self, *, classes=None, sparse_output=False): self.classes = classes self.sparse_output = sparse_output + @_fit_context(prefer_skip_nested_validation=True) def fit(self, y): """Fit the label sets binarizer, storing :term:`classes_`. @@ -776,7 +775,6 @@ def fit(self, y): self : object Fitted estimator. """ - self._validate_params() self._cached_dict = None if self.classes is None: @@ -794,6 +792,7 @@ def fit(self, y): self.classes_[:] = classes return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, y): """Fit the label sets binarizer and transform the given label sets. 
@@ -814,7 +813,6 @@ def fit_transform(self, y): if self.classes is not None: return self.fit(y).transform(y) - self._validate_params() self._cached_dict = None # Automatically increment on new class diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 08ccf6355fc4e..1dfee8a088114 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -12,6 +12,7 @@ from scipy.special import comb from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils import check_array from ..utils.fixes import sp_version, parse_version from ..utils.validation import check_is_fitted, FLOAT_DTYPES, _check_sample_weight @@ -299,6 +300,7 @@ def get_feature_names_out(self, input_features=None): feature_names.append(name) return np.asarray(feature_names, dtype=object) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Compute number of output features. @@ -316,7 +318,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() _, n_features = self._validate_data(X, accept_sparse=True).shape if isinstance(self.degree, Integral): @@ -802,6 +803,7 @@ def get_feature_names_out(self, input_features=None): feature_names.append(f"{input_features[i]}_sp_{j}") return np.asarray(feature_names, dtype=object) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute knot positions of splines. @@ -823,8 +825,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted transformer. """ - self._validate_params() - X = self._validate_data( X, reset=True, diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py index 9100d72194a32..9dd33ddfa3cce 100644 --- a/sklearn/preprocessing/_target_encoder.py +++ b/sklearn/preprocessing/_target_encoder.py @@ -4,6 +4,7 @@ from ._encoders import _BaseEncoder from ..base import OneToOneFeatureMixin +from ..base import _fit_context from ._target_encoder_fast import _fit_encoding_fast from ._target_encoder_fast import _fit_encoding_fast_auto_smooth from ..utils.validation import _check_y, check_consistent_length @@ -176,6 +177,7 @@ def __init__( self.shuffle = shuffle self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the :class:`TargetEncoder` to X and y. @@ -192,10 +194,10 @@ def fit(self, X, y): self : object Fitted encoder. """ - self._validate_params() self._fit_encodings_all(X, y) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y): """Fit :class:`TargetEncoder` and transform X with the target encoding. 
@@ -219,7 +221,6 @@ def fit_transform(self, X, y): """ from ..model_selection import KFold, StratifiedKFold # avoid circular import - self._validate_params() X_ordinal, X_known_mask, y, n_categories = self._fit_encodings_all(X, y) # The cv splitter is voluntarily restricted to *KFold to enforce non diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 2e6fd810fedac..c00de906a7dbb 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2669,3 +2669,22 @@ def test_kernel_centerer_feature_names_out(): names_out = centerer.get_feature_names_out() samples_out2 = X_pairwise.shape[1] assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)]) + + +@pytest.mark.parametrize("standardize", [True, False]) +def test_power_transformer_constant_feature(standardize): + """Check that PowerTransfomer leaves constant features unchanged.""" + X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]] + + pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X) + + assert_allclose(pt.lambdas_, [1, 1, 1]) + + Xft = pt.fit_transform(X) + Xt = pt.transform(X) + + for Xt_ in [Xft, Xt]: + if standardize: + assert_allclose(Xt_, np.zeros_like(X)) + else: + assert_allclose(Xt_, X) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 9e9620e089521..ca0ee41784ab5 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -36,7 +36,7 @@ from .base import BaseEstimator, TransformerMixin from .base import ClassNamePrefixFeaturesOutMixin - +from .base import _fit_context from .utils import check_random_state from .utils._param_validation import Interval, StrOptions, validate_params from .utils.extmath import safe_sparse_dot @@ -356,6 +356,7 @@ def _compute_inverse_components(self): components = components.toarray() return linalg.pinv(components, check_finite=False) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Generate a sparse random projection matrix. @@ -374,7 +375,6 @@ def fit(self, X, y=None): self : object BaseRandomProjection class instance. """ - self._validate_params() X = self._validate_data( X, accept_sparse=["csr", "csc"], dtype=[np.float64, np.float32] ) diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 95fad0713d558..9d7786bc1d67e 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -64,6 +64,7 @@ from scipy.sparse import csgraph from ..base import BaseEstimator, ClassifierMixin +from ..base import _fit_context from ..metrics.pairwise import rbf_kernel from ..neighbors import NearestNeighbors from ..utils.extmath import safe_sparse_dot @@ -230,6 +231,7 @@ class labels. probabilities /= normalizer return probabilities + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit a semi-supervised label propagation model to X. @@ -254,7 +256,6 @@ def fit(self, X, y): self : object Returns the instance itself. 
""" - self._validate_params() X, y = self._validate_data( X, y, diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index 2438658ed89c8..c4706df1754da 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -4,6 +4,7 @@ import numpy as np from ..base import MetaEstimatorMixin, clone, BaseEstimator +from ..base import _fit_context from ..utils._param_validation import HasMethods, Interval, StrOptions from ..utils.validation import check_is_fitted from ..utils.metaestimators import available_if @@ -171,6 +172,10 @@ def __init__( self.max_iter = max_iter self.verbose = verbose + @_fit_context( + # SelfTrainingClassifier.base_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """ Fit self-training classifier using `X`, `y` as training data. @@ -189,8 +194,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() - # we need row slicing support for sparce matrices, but costly finiteness check # can be delegated to the base estimator. X, y = self._validate_data( diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 55919099e027c..a54c31cecb6e1 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -11,6 +11,7 @@ from . import _liblinear as liblinear # type: ignore from . import _libsvm_sparse as libsvm_sparse # type: ignore from ..base import BaseEstimator, ClassifierMixin +from ..base import _fit_context from ..preprocessing import LabelEncoder from ..utils.multiclass import _ovr_decision_function from ..utils import check_array, check_random_state @@ -143,6 +144,7 @@ def _more_tags(self): # Used by cross_val_score. return {"pairwise": self.kernel == "precomputed"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the SVM model according to the given training data. @@ -176,8 +178,6 @@ def fit(self, X, y, sample_weight=None): If X is a dense array, then the other methods will not support sparse matrices as input. """ - self._validate_params() - rnd = check_random_state(self.random_state) sparse = sp.isspmatrix(X) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index e035e74a05e2c..a438d007da970 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -5,6 +5,7 @@ from ._base import _fit_liblinear, _get_liblinear_solver_type, BaseSVC, BaseLibSVM from ..base import BaseEstimator, RegressorMixin, OutlierMixin +from ..base import _fit_context from ..linear_model._base import LinearClassifierMixin, SparseCoefMixin, LinearModel from ..utils import deprecated from ..utils.validation import _num_samples @@ -272,6 +273,7 @@ def __init__( self.penalty = penalty self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -296,8 +298,6 @@ def fit(self, X, y, sample_weight=None): self : object An instance of the estimator. """ - self._validate_params() - X, y = self._validate_data( X, y, @@ -529,6 +529,7 @@ def __init__( self.dual = dual self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -553,8 +554,6 @@ def fit(self, X, y, sample_weight=None): self : object An instance of the estimator. 
""" - self._validate_params() - X, y = self._validate_data( X, y, diff --git a/sklearn/tests/test_metadata_routing.py b/sklearn/tests/test_metadata_routing.py index 3b00b5a244ee8..a6e74c12f6e45 100644 --- a/sklearn/tests/test_metadata_routing.py +++ b/sklearn/tests/test_metadata_routing.py @@ -653,6 +653,21 @@ def fit(self, X, y, prop=None, **kwargs): Klass().fit(None, None) # for coverage +def test_removing_non_existing_param_raises(): + """Test that removing a metadata using UNUSED which doesn't exist raises.""" + + class InvalidRequestRemoval(BaseEstimator): + # `fit` (in this class or a parent) requests `prop`, but we don't want + # it requested at all. + __metadata_request__fit = {"prop": metadata_routing.UNUSED} + + def fit(self, X, y, **kwargs): + return self + + with pytest.raises(ValueError, match="Trying to remove parameter"): + InvalidRequestRemoval().get_metadata_routing() + + def test_method_metadata_request(): mmr = MethodMetadataRequest(owner="test", method="fit") diff --git a/sklearn/tests/test_public_functions.py b/sklearn/tests/test_public_functions.py index 3157e344cbef3..99f7f22d92e3d 100644 --- a/sklearn/tests/test_public_functions.py +++ b/sklearn/tests/test_public_functions.py @@ -241,6 +241,7 @@ def _check_function_param_validation( "sklearn.metrics.pairwise.manhattan_distances", "sklearn.metrics.pairwise.nan_euclidean_distances", "sklearn.metrics.pairwise.paired_cosine_distances", + "sklearn.metrics.pairwise.paired_distances", "sklearn.metrics.pairwise.paired_euclidean_distances", "sklearn.metrics.pairwise.paired_manhattan_distances", "sklearn.metrics.pairwise.polynomial_kernel", diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 4fdd8f27cd652..64a444db0b228 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -31,6 +31,7 @@ from sklearn.base import RegressorMixin from sklearn.base import is_classifier from sklearn.base import MultiOutputMixin +from sklearn.base import _fit_context from sklearn.utils import Bunch from sklearn.utils import check_random_state from sklearn.utils.validation import _check_sample_weight @@ -120,6 +121,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], + "store_leaf_values": [bool], } @abstractmethod @@ -138,6 +140,7 @@ def __init__( min_impurity_decrease, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): self.criterion = criterion self.splitter = splitter @@ -151,6 +154,7 @@ def __init__( self.min_impurity_decrease = min_impurity_decrease self.class_weight = class_weight self.ccp_alpha = ccp_alpha + self.store_leaf_values = store_leaf_values def get_depth(self): """Return the depth of the decision tree. @@ -180,7 +184,7 @@ def get_n_leaves(self): def _support_missing_values(self, X): return not issparse(X) and self._get_tags()["allow_nan"] - def _compute_feature_has_missing(self, X): + def _compute_missing_values_in_feature_mask(self, X): """Return boolean mask denoting if there are missing values for each feature. This method also ensures that X is finite. @@ -192,7 +196,7 @@ def _compute_feature_has_missing(self, X): Returns ------- - feature_has_missing : ndarray of shape (n_features,), or None + missing_values_in_feature_mask : ndarray of shape (n_features,), or None Missing value mask. 
If missing values are not supported or there are no missing values, return None. """ @@ -213,13 +217,17 @@ def _compute_feature_has_missing(self, X): if not np.isnan(overall_sum): return None - feature_has_missing = _any_isnan_axis0(X) - return feature_has_missing + missing_values_in_feature_mask = _any_isnan_axis0(X) + return missing_values_in_feature_mask def _fit( - self, X, y, sample_weight=None, check_input=True, feature_has_missing=None + self, + X, + y, + sample_weight=None, + check_input=True, + missing_values_in_feature_mask=None, ): - self._validate_params() random_state = check_random_state(self.random_state) if check_input: @@ -227,7 +235,7 @@ def _fit( # We can't pass multi_output=True because that would allow y to be # csr. - # _compute_feature_has_missing will check for finite values and + # _compute_missing_values_in_feature_mask will check for finite values and # compute the missing mask if the tree supports missing values check_X_params = dict( dtype=DTYPE, accept_sparse="csc", force_all_finite=False @@ -240,7 +248,9 @@ def _fit( else: X = self._validate_data(X, **check_X_params) - feature_has_missing = self._compute_feature_has_missing(X) + missing_values_in_feature_mask = ( + self._compute_missing_values_in_feature_mask(X) + ) if issparse(X): X.sort_indices() @@ -388,7 +398,7 @@ def _fit( X, y, sample_weight, - feature_has_missing, + missing_values_in_feature_mask, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -397,6 +407,9 @@ def _fit( random_state, ) + if self.store_leaf_values: + self.leaf_nodes_samples_ = self.tree_.leaf_nodes_samples + return self def _build_tree( @@ -404,7 +417,7 @@ def _build_tree( X, y, sample_weight, - feature_has_missing, + missing_values_in_feature_mask, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -483,6 +496,7 @@ def _build_tree( min_weight_leaf, max_depth, self.min_impurity_decrease, + self.store_leaf_values, ) else: builder = BestFirstTreeBuilder( @@ -493,9 +507,10 @@ def _build_tree( max_depth, max_leaf_nodes, self.min_impurity_decrease, + self.store_leaf_values, ) - builder.build(self.tree_, X, y, sample_weight, feature_has_missing) + builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): self.n_classes_ = self.n_classes_[0] @@ -551,6 +566,9 @@ def predict(self, X, check_input=True): """ check_is_fitted(self) X = self._validate_X_predict(X, check_input) + + # proba is a count matrix of leaves that fall into + # (n_samples, n_outputs, max_n_classes) array proba = self.tree_.predict(X) n_samples = X.shape[0] @@ -577,6 +595,128 @@ def predict(self, X, check_input=True): else: return proba[:, :, 0] + def get_leaf_node_samples(self, X, check_input=True): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_nodes_samples : a list of array-like of shape + (n_leaf_node_samples, n_outputs) + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. 
+ """ + if not self.store_leaf_values: + raise RuntimeError( + "leaf node samples are not stored when store_leaf_values=False" + ) + + # get indices of leaves per sample (n_samples,) + X_leaves = self.apply(X, check_input=check_input) + n_samples = X_leaves.shape[0] + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + leaf_nodes_samples = [] + for idx in range(n_samples): + leaf_id = X_leaves[idx] + leaf_nodes_samples.append(leaf_samples[leaf_id]) + return leaf_nodes_samples + + def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float or array-like of float, default=0.5 + The quantile(s) at which to evaluate, by default 0.5 (the median). + method : str, default="nearest" + The interpolation method to use. Can be any interpolation keyword + argument accepted by :func:`np.quantile`. + check_input : bool, default=True + Whether or not to check the input. + + Returns + ------- + predictions : array-like of shape (n_samples, len(quantiles), n_outputs) + The predicted quantiles; the trailing output axis is dropped when + the tree has a single output. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Predicting quantiles requires that the tree stores leaf node samples." + ) + + check_is_fitted(self) + + # Check data + X = self._validate_X_predict(X, check_input) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # get indices of leaves per sample + X_leaves = self.apply(X) + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + # compute quantiles (n_samples, n_quantiles, n_outputs) + n_samples = X.shape[0] + n_quantiles = len(quantiles) + proba = np.zeros((n_samples, n_quantiles, self.n_outputs_)) + for idx, leaf_id in enumerate(X_leaves): + # predict by taking the quantile across the samples in the leaf for + # each output + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, interpolation=method + ) + + # Classification + if is_classifier(self): + if self.n_outputs_ == 1: + # return the class with the highest probability for each quantile + # (n_samples, n_quantiles) + class_preds = np.zeros( + (n_samples, n_quantiles), dtype=self.classes_.dtype + ) + for i in range(n_quantiles): + class_pred_per_sample = ( + proba[:, i, :].squeeze().astype(self.classes_.dtype) + ) + class_preds[:, i] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + return class_preds + else: + class_type = self.classes_[0].dtype + predictions = np.zeros( + (n_samples, n_quantiles, self.n_outputs_), dtype=class_type + ) + for k in range(self.n_outputs_): + for i in range(n_quantiles): + class_pred_per_sample = proba[:, i, k].squeeze().astype(int) + predictions[:, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + + return predictions + # Regression + else: + if self.n_outputs_ == 1: + return proba[:, :, 0] + + else: + return proba + def apply(self, X, check_input=True): """Return the index of the leaf that each sample is predicted as. @@ -851,6 +991,16 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the training target values of the samples that fall into + each leaf in the ``tree_`` attribute. + Each leaf stores a 2D array of these values, keyed by the leaf's node id.
+ + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -896,6 +1046,9 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + leaf_nodes_samples_ : dict + A dictionary of leaf node index and the y_train samples in that leaf. + See Also -------- DecisionTreeRegressor : A decision tree regressor. @@ -965,6 +1118,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -979,8 +1133,10 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree classifier from the training set (X, y). @@ -1327,6 +1483,7 @@ def __init__( max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1340,8 +1497,10 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree regressor from the training set (X, y). @@ -1653,6 +1812,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1667,6 +1827,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) @@ -1880,6 +2041,7 @@ def __init__( min_impurity_decrease=0.0, max_leaf_nodes=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1893,4 +2055,5 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 721b475f40436..31c10ccfe4f93 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -92,7 +92,7 @@ cdef class Criterion(BaseCriterion): cdef void node_samples( self, - vector[vector[DOUBLE_t]]* dest + vector[vector[DOUBLE_t]]& dest ) noexcept nogil cdef class ClassificationCriterion(Criterion): diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index c3f08ec859bee..dfa64c1184df5 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -258,9 +258,17 @@ cdef class Criterion(BaseCriterion): cdef void node_samples( self, - vector[vector[DOUBLE_t]]* dest + vector[vector[DOUBLE_t]]& dest ) noexcept nogil: - cdef SIZE_t i, j + """Copy the samples of the current node into dest. + + Parameters + ---------- + dest : reference vector[vector[DOUBLE_t]] + The vector of vectors where the samples should be copied. + This is passed by reference and modified in place. 
+ """ + cdef SIZE_t i, j, k # Resize the destination vector of vectors dest.resize(self.n_node_samples) @@ -272,7 +280,8 @@ cdef class Criterion(BaseCriterion): # Get the sample values for each output for k in range(self.n_outputs): - dest[i][k].push_back(self.y[j, k]) + dest[i].push_back(self.y[j, k]) + cdef inline void _move_sums_classification( ClassificationCriterion criterion, diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index fb21f676e66cc..915b2baa30e94 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -104,10 +104,10 @@ cdef class Splitter(BaseSplitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1 - cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil # Methods that allow modifications to stopping conditions cdef bint check_presplit_conditions( diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 7f21d5da545fb..1f3d164370b95 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -168,7 +168,7 @@ cdef class Splitter(BaseSplitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: """Initialize the splitter. @@ -245,7 +245,7 @@ cdef class Splitter(BaseSplitter): self.end ) - if feature_has_missing is not None: + if missing_values_in_feature_mask is not None: self.criterion.init_sum_missing() return 0 @@ -280,7 +280,7 @@ cdef class Splitter(BaseSplitter): self.criterion.node_value(dest) - cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil: + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil: """Copy the samples[start:end] into dest.""" self.criterion.node_samples(dest) @@ -903,19 +903,19 @@ cdef class DensePartitioner: cdef SIZE_t start cdef SIZE_t end cdef SIZE_t n_missing - cdef const unsigned char[::1] feature_has_missing + cdef const unsigned char[::1] missing_values_in_feature_mask def __init__( self, const DTYPE_t[:, :] X, SIZE_t[::1] samples, DTYPE_t[::1] feature_values, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ): self.X = X self.samples = samples self.feature_values = feature_values - self.feature_has_missing = feature_has_missing + self.missing_values_in_feature_mask = missing_values_in_feature_mask cdef inline void init_node_split(self, SIZE_t start, SIZE_t end) noexcept nogil: """Initialize splitter at the beginning of node_split.""" @@ -938,13 +938,13 @@ cdef class DensePartitioner: const DTYPE_t[:, :] X = self.X SIZE_t[::1] samples = self.samples SIZE_t n_missing = 0 - const unsigned char[::1] feature_has_missing = self.feature_has_missing + const unsigned char[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask # Sort samples along that feature; by # copying the values into an array and # sorting the array in a manner which utilizes the cache more # effectively. - if feature_has_missing is not None and feature_has_missing[current_feature]: + if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]: i, current_end = self.start, self.end - 1 # Missing values are placed at the end and do not participate in the sorting. 
while i <= current_end: @@ -1113,7 +1113,7 @@ cdef class SparsePartitioner: cdef SIZE_t start cdef SIZE_t end cdef SIZE_t n_missing - cdef const unsigned char[::1] feature_has_missing + cdef const unsigned char[::1] missing_values_in_feature_mask cdef const DTYPE_t[::1] X_data cdef const INT32_t[::1] X_indices @@ -1134,7 +1134,7 @@ cdef class SparsePartitioner: SIZE_t[::1] samples, SIZE_t n_samples, DTYPE_t[::1] feature_values, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ): if not isspmatrix_csc(X): raise ValueError("X should be in csc format") @@ -1158,7 +1158,7 @@ cdef class SparsePartitioner: for p in range(n_samples): self.index_to_samples[samples[p]] = p - self.feature_has_missing = feature_has_missing + self.missing_values_in_feature_mask = missing_values_in_feature_mask cdef inline void init_node_split(self, SIZE_t start, SIZE_t end) noexcept nogil: """Initialize splitter at the beginning of node_split.""" @@ -1529,11 +1529,11 @@ cdef class BestSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = DensePartitioner( - X, self.samples, self.feature_values, feature_has_missing + X, self.samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, @@ -1555,11 +1555,11 @@ cdef class BestSparseSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = SparsePartitioner( - X, self.samples, self.n_samples, self.feature_values, feature_has_missing + X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, @@ -1581,11 +1581,11 @@ cdef class RandomSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = DensePartitioner( - X, self.samples, self.feature_values, feature_has_missing + X, self.samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, @@ -1607,11 +1607,11 @@ cdef class RandomSparseSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = SparsePartitioner( - X, self.samples, self.n_samples, self.feature_values, feature_has_missing + X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, 
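(Aside for reviewers; illustration only, not part of the diff.) The splitter and criterion hunks above, together with the ``_tree.pxd`` / ``_tree.pyx`` hunks below, back the new Python-level API added in ``sklearn/tree/_classes.py``: ``store_leaf_values=True`` makes a fitted tree keep the training target values of the samples that reach each leaf (``tree_.leaf_nodes_samples``, a dict keyed by leaf node id), and ``get_leaf_node_samples`` / ``predict_quantiles`` read from that store. A rough usage sketch, assuming this fork is installed (upstream scikit-learn does not expose these methods); the toy data is made up:

# Sketch of the leaf-storage / quantile-prediction API added by this patch.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]])
y = np.array([0.0, 0.5, 1.0, 9.0, 9.5, 10.0])

# Keep the y values of the training samples that end up in each leaf.
reg = DecisionTreeRegressor(max_depth=1, store_leaf_values=True, random_state=0)
reg.fit(X, y)

# One array per query sample: the stored y values of the leaf it falls into.
leaf_samples = reg.get_leaf_node_samples(X[:2])

# Per-sample quantiles over those stored values; for a single-output tree the
# result has shape (n_samples, n_quantiles).
q = reg.predict_quantiles(X[:2], quantiles=[0.1, 0.5, 0.9])

Calling either method on a tree fitted with the default ``store_leaf_values=False`` raises a ``RuntimeError``; the new tests ``test_leaf_node_samples`` and ``test_quantile_tree_predict`` further below exercise exactly this behaviour.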
diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 94714cc33400c..828c99a2f4ea1 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -49,13 +49,6 @@ cdef class BaseTree: cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample cdef double* value # Array of values prediction values for each node - # Enables the use of tree to store distributions of the output to allow - # arbitrary usage of the the leaves. This is used in the quantile - # estimators for example. - # for storing samples at each leaf node with leaf's node ID as the key and - # the sample values as the value - cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples - # Generic Methods: These are generic methods used by any tree. cdef int _resize(self, SIZE_t capacity) except -1 nogil cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil @@ -121,9 +114,18 @@ cdef class Tree(BaseTree): cdef public SIZE_t n_outputs # Number of outputs in y cdef public SIZE_t max_n_classes # max(n_classes) + # Enables the use of tree to store distributions of the output to allow + # arbitrary usage of the the leaves. This is used in the quantile + # estimators for example. + # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples + # Methods cdef cnp.ndarray _get_value_ndarray(self) cdef cnp.ndarray _get_node_ndarray(self) + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id) + cdef cnp.ndarray _get_value_samples_keys(self) cpdef cnp.ndarray predict(self, object X) @@ -146,7 +148,7 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping - cdef unsigned char store_leaf_values # Whether to store leaf values + cdef unsigned char store_leaf_values # Whether to store leaf values cpdef build( self, @@ -154,7 +156,7 @@ cdef class TreeBuilder: object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=*, - const unsigned char[::1] feature_has_missing=*, + const unsigned char[::1] missing_values_in_feature_mask=*, ) cdef _check_input( diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 8ca98a64b42ab..1565ab441969d 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -100,7 +100,7 @@ cdef class TreeBuilder: object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=None, - const unsigned char[::1] feature_has_missing=None, + const unsigned char[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" pass @@ -182,7 +182,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=None, - const unsigned char[::1] feature_has_missing=None, + const unsigned char[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" @@ -208,7 +208,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef double min_impurity_decrease = self.min_impurity_decrease # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, feature_has_missing) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask) cdef SIZE_t start cdef SIZE_t end @@ -229,8 +229,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t max_depth_seen = -1 cdef int rc = 0 - cdef int node_idx - cdef stack[StackRecord] builder_stack cdef StackRecord stack_record @@ -319,11 +317,8 @@ 
cdef class DepthFirstTreeBuilder(TreeBuilder): "impurity": split.impurity_left, "n_constant_features": n_constant_features}) elif self.store_leaf_values and is_leaf: - with gil: - print('Storing leaf values...') - # copy leaf values to leaf_values array - splitter.node_samples(&tree.value_samples[node_id]) + splitter.node_samples(tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -406,7 +401,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=None, - const unsigned char[::1] feature_has_missing=None, + const unsigned char[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" @@ -418,7 +413,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, feature_has_missing) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask) cdef vector[FrontierRecord] frontier cdef FrontierRecord record @@ -459,6 +454,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + if self.store_leaf_values: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[record.node_id]) else: # Node is expandable @@ -1321,6 +1319,14 @@ cdef class Tree(BaseTree): def value(self): return self._get_value_ndarray()[:self.node_count] + @property + def leaf_nodes_samples(self): + leaf_node_samples = dict() + keys = self._get_value_samples_keys() + for node_id in keys: + leaf_node_samples[node_id] = self._get_value_samples_ndarray(node_id) + return leaf_node_samples + # TODO: Convert n_classes to cython.integral memory view once # https://github.com/cython/cython/issues/5243 is fixed def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): @@ -1374,6 +1380,7 @@ cdef class Tree(BaseTree): d["node_count"] = self.node_count d["nodes"] = self._get_node_ndarray() d["values"] = self._get_value_ndarray() + d['value_samples'] = self.leaf_nodes_samples return d def __setstate__(self, d): @@ -1407,6 +1414,35 @@ cdef class Tree(BaseTree): memcpy(self.value, cnp.PyArray_DATA(value_ndarray), self.capacity * self.value_stride * sizeof(double)) + # store the leaf node samples if they exist + value_samples_dict = d['value_samples'] + for node_id, leaf_samples in value_samples_dict.items(): + self.value_samples[node_id].resize(leaf_samples.shape[0]) + for idx in range(leaf_samples.shape[0]): + for jdx in range(leaf_samples.shape[1]): + self.value_samples[node_id][idx].push_back(leaf_samples[idx, jdx]) + + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id): + """Wraps value_samples as a 2-d NumPy array per node_id.""" + cdef int i, j + cdef int n_samples = self.value_samples[node_id].size() + cdef cnp.ndarray[DOUBLE_t, ndim=2, mode='c'] leaf_node_samples = np.empty(shape=(n_samples, self.n_outputs), dtype=np.float64) + + for i in range(n_samples): + for j in range(self.n_outputs): + leaf_node_samples[i, j] = self.value_samples[node_id][i][j] + return leaf_node_samples + + cdef cnp.ndarray _get_value_samples_keys(self): + """Wraps value_samples keys as a 1-d NumPy array of keys.""" + cdef cnp.ndarray[SIZE_t, ndim=1, mode='c'] keys = np.empty(len(self.value_samples), dtype=np.intp) + cdef unsigned int i = 0 + + for key in self.value_samples: + keys[i] = key.first + i += 1 + return keys + cdef cnp.ndarray _get_value_ndarray(self): """Wraps value as 
a 3-d NumPy array. diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index eefae6cdaa3f6..44a19b3dc0520 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -890,7 +890,7 @@ def test_pickle(): else: X, y = diabetes.data, diabetes.target - est = TreeEstimator(random_state=0) + est = TreeEstimator(random_state=0, store_leaf_values=True) est.fit(X, y) score = est.score(X, y) @@ -909,6 +909,7 @@ def test_pickle(): "n_node_samples", "weighted_n_node_samples", "value", + "leaf_nodes_samples", ] fitted_attribute = { attribute: getattr(est.tree_, attribute) for attribute in attributes @@ -923,14 +924,25 @@ def test_pickle(): score == score2 ), "Failed to generate same score after pickling with {0}".format(name) for attribute in fitted_attribute: - assert_array_equal( - getattr(est2.tree_, attribute), - fitted_attribute[attribute], - err_msg=( - f"Failed to generate same attribute {attribute} after pickling with" - f" {name}" - ), - ) + if attribute == "leaf_nodes_samples": + for key in fitted_attribute[attribute].keys(): + assert_array_equal( + getattr(est2.tree_, attribute)[key], + fitted_attribute[attribute][key], + err_msg=( + f"Failed to generate same attribute {attribute} after" + f" pickling with {name}" + ), + ) + else: + assert_array_equal( + getattr(est2.tree_, attribute), + fitted_attribute[attribute], + err_msg=( + f"Failed to generate same attribute {attribute} after pickling" + f" with {name}" + ), + ) def test_multioutput(): @@ -2634,3 +2646,148 @@ def test_sample_weight_non_uniform(make_data, Tree): tree_samples_removed.fit(X[1::2, :], y[1::2]) assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X)) + + +@pytest.mark.parametrize( + "tree_name", + ALL_TREES, +) +def test_leaf_node_samples(tree_name): + """Test getting leaf node samples from fitted tree.""" + tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=False) + tree.fit(X_small, y_small) + + # Check that the leaf node samples are not stored by default + assert tree.tree_.leaf_nodes_samples == dict() + + # error should be raised if trying to predict quantiles + assert hasattr(tree, "predict_quantiles") + for meth in ["predict_quantiles", "get_leaf_node_samples"]: + if hasattr(tree, meth): + with pytest.raises( + RuntimeError, + match="leaf node samples", + ): + getattr(tree, meth)(X_small) + + quantile_tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=True) + quantile_tree.fit(X_small, y_small) + + score = tree.score(X_small, y_small) + new_score = quantile_tree.score(X_small, y_small) + assert np.isclose(score, new_score) + + # Check that the leaf node samples are what they should be + X_leaves = quantile_tree.apply(X_small) + for idx in range(X_leaves.shape[0]): + leaf_idx = X_leaves[idx] + assert y_small[idx] in quantile_tree.tree_.leaf_nodes_samples[leaf_idx] + assert set(np.unique(X_leaves)) == set( + quantile_tree.tree_.leaf_nodes_samples.keys() + ) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0) + + # fit on binary results in perfect leaves, so all quantiles are the same + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(X_small), pred[:, 0]) + assert_array_equal(est.predict(X_small), pred[:, 1]) + assert_array_equal(est.predict(X_small), pred[:, 2]) + 
assert_array_equal(pred[:, 0], y_small) + assert np.unique(pred, axis=1).shape[1] == 1 + + est.fit(X_small[:-5], y_small[:-5]) + held_out_X = X_small[-5:, :] + pred = est.predict_quantiles(held_out_X, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(held_out_X), pred[:, 0]) + assert_array_equal(est.predict(held_out_X), pred[:, 1]) + assert_array_equal(est.predict(held_out_X), pred[:, 2]) + + # fit on real data + est.fit(iris.data, iris.target) + pred = est.predict_quantiles(iris.data, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(pred[:, 0], iris.target) + assert_array_equal(pred[:, 1], iris.target) + assert_array_equal(pred[:, 2], iris.target) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict_impure_leaves(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0, max_depth=4) + # fit on binary results with constrained depth will result in impure leaves + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert np.unique(pred, axis=1).shape[1] > 1 + + +def test_multioutput_quantiles(): + # Check estimators on multi-output problems. + X = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + + y = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + + T = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + # toy classification problem + for name, TreeClassifier in CLF_TREES.items(): + clf = TreeClassifier(random_state=0, store_leaf_values=True) + clf.fit(X, y) + + y_hat = clf.predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + y_hat = y_hat.squeeze() + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) + + # toy regression problem + for name, TreeRegressor in REG_TREES.items(): + reg = TreeRegressor(random_state=0, store_leaf_values=True) + y_hat = reg.fit(X, y).predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) diff --git a/sklearn/utils/_metadata_requests.py b/sklearn/utils/_metadata_requests.py index 82b3eec69b461..a1cd934c13756 100644 --- a/sklearn/utils/_metadata_requests.py +++ b/sklearn/utils/_metadata_requests.py @@ -241,8 +241,14 @@ def add_request( if alias == param: alias = True - if alias == UNUSED and param in self._requests: - del self._requests[param] + if alias == UNUSED: + if param in self._requests: + del self._requests[param] + else: + raise ValueError( + f"Trying to remove parameter {param} with UNUSED which doesn't" + " exist." 
+ ) else: self._requests[param] = alias @@ -1155,7 +1161,7 @@ def _build_request_for_signature(cls, router, method): # ignore the first parameter of the method, which is usually "self" params = list(inspect.signature(getattr(cls, method)).parameters.items())[1:] for pname, param in params: - if pname in {"X", "y", "Y"}: + if pname in {"X", "y", "Y", "Xt", "yt"}: continue if param.kind in {param.VAR_POSITIONAL, param.VAR_KEYWORD}: continue diff --git a/sklearn/utils/_plotting.py b/sklearn/utils/_plotting.py index cc301b509e386..c0671046c9cd4 100644 --- a/sklearn/utils/_plotting.py +++ b/sklearn/utils/_plotting.py @@ -1,3 +1,5 @@ +import numpy as np + from . import check_consistent_length, check_matplotlib_support from .multiclass import type_of_target from .validation import _check_pos_label_consistency @@ -56,3 +58,41 @@ def _validate_from_predictions_params( name = name if name is not None else "Classifier" return pos_label, name + + +def _validate_score_name(score_name, scoring, negate_score): + """Validate the `score_name` parameter. + + If `score_name` is provided, we just return it as-is. + If `score_name` is `None`, we use `Score` if `negate_score` is `False` and + `Negative score` otherwise. + If `score_name` is a string or a callable, we infer the name. We replace `_` by + spaces and capitalize the first letter. We remove `neg_` and replace it by + `"Negative"` if `negate_score` is `False` or just remove it otherwise. + """ + if score_name is not None: + return score_name + elif scoring is None: + return "Negative score" if negate_score else "Score" + else: + score_name = scoring.__name__ if callable(scoring) else scoring + if negate_score: + if score_name.startswith("neg_"): + score_name = score_name[4:] + else: + score_name = f"Negative {score_name}" + elif score_name.startswith("neg_"): + score_name = f"Negative {score_name[4:]}" + score_name = score_name.replace("_", " ") + return score_name.capitalize() + + +def _interval_max_min_ratio(data): + """Compute the ratio between the largest and smallest inter-point distances. + + A value larger than 5 typically indicates that the parameter range would + better be displayed with a log scale while a linear scale would be more + suitable otherwise. + """ + diff = np.diff(np.sort(data)) + return diff.max() / diff.min() diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index cb1e0f2b1fa4d..7d8e673210ff7 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -4424,7 +4424,7 @@ def _output_from_fit_transform(transformer, name, X, df, y): return outputs -def _check_generated_dataframe(name, case, outputs_default, outputs_pandas): +def _check_generated_dataframe(name, case, index, outputs_default, outputs_pandas): import pandas as pd X_trans, feature_names_default = outputs_default @@ -4434,7 +4434,12 @@ def _check_generated_dataframe(name, case, outputs_default, outputs_pandas): # We always rely on the output of `get_feature_names_out` of the # transformer used to generate the dataframe as a ground-truth of the # columns. 
- expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas, copy=False) + # If a dataframe is passed into transform, then the output should have the same + # index + expected_index = index if case.endswith("df") else None + expected_dataframe = pd.DataFrame( + X_trans, columns=feature_names_pandas, copy=False, index=expected_index + ) try: pd.testing.assert_frame_equal(df_trans, expected_dataframe) @@ -4469,7 +4474,8 @@ def check_set_output_transform_pandas(name, transformer_orig): set_random_state(transformer) feature_names_in = [f"col{i}" for i in range(X.shape[1])] - df = pd.DataFrame(X, columns=feature_names_in, copy=False) + index = [f"index{i}" for i in range(X.shape[0])] + df = pd.DataFrame(X, columns=feature_names_in, copy=False, index=index) transformer_default = clone(transformer).set_output(transform="default") outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y) @@ -4483,7 +4489,7 @@ def check_set_output_transform_pandas(name, transformer_orig): for case in outputs_default: _check_generated_dataframe( - name, case, outputs_default[case], outputs_pandas[case] + name, case, index, outputs_default[case], outputs_pandas[case] ) @@ -4511,7 +4517,8 @@ def check_global_ouptut_transform_pandas(name, transformer_orig): set_random_state(transformer) feature_names_in = [f"col{i}" for i in range(X.shape[1])] - df = pd.DataFrame(X, columns=feature_names_in, copy=False) + index = [f"index{i}" for i in range(X.shape[0])] + df = pd.DataFrame(X, columns=feature_names_in, copy=False, index=index) transformer_default = clone(transformer).set_output(transform="default") outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y) @@ -4528,5 +4535,5 @@ def check_global_ouptut_transform_pandas(name, transformer_orig): for case in outputs_default: _check_generated_dataframe( - name, case, outputs_default[case], outputs_pandas[case] + name, case, index, outputs_default[case], outputs_pandas[case] ) diff --git a/sklearn/utils/tests/test_param_validation.py b/sklearn/utils/tests/test_param_validation.py index 528a667a3f58e..022f9f373a049 100644 --- a/sklearn/utils/tests/test_param_validation.py +++ b/sklearn/utils/tests/test_param_validation.py @@ -6,6 +6,7 @@ from sklearn._config import config_context, get_config from sklearn.base import BaseEstimator +from sklearn.base import _fit_context from sklearn.model_selection import LeaveOneOut from sklearn.utils import deprecated from sklearn.utils._param_validation import Hidden @@ -60,8 +61,9 @@ class _Estimator(BaseEstimator): def __init__(self, a): self.a = a + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X=None, y=None): - self._validate_params() + pass @pytest.mark.parametrize("interval_type", [Integral, Real]) diff --git a/sklearn/utils/tests/test_plotting.py b/sklearn/utils/tests/test_plotting.py new file mode 100644 index 0000000000000..00b1f7f74fcd0 --- /dev/null +++ b/sklearn/utils/tests/test_plotting.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest + +from sklearn.utils._plotting import _validate_score_name, _interval_max_min_ratio + + +def metric(): + pass # pragma: no cover + + +def neg_metric(): + pass # pragma: no cover + + +@pytest.mark.parametrize( + "score_name, scoring, negate_score, expected_score_name", + [ + ("accuracy", None, False, "accuracy"), # do not transform the name + (None, "accuracy", False, "Accuracy"), # capitalize the name + (None, "accuracy", True, "Negative accuracy"), # add "Negative" + (None, "neg_mean_absolute_error", False, 
"Negative mean absolute error"), + (None, "neg_mean_absolute_error", True, "Mean absolute error"), # remove "neg_" + ("MAE", "neg_mean_absolute_error", True, "MAE"), # keep score_name + (None, None, False, "Score"), # default name + (None, None, True, "Negative score"), # default name but negated + ("Some metric", metric, False, "Some metric"), # do not transform the name + ("Some metric", metric, True, "Some metric"), # do not transform the name + (None, metric, False, "Metric"), # default name + (None, metric, True, "Negative metric"), # default name but negated + ("Some metric", neg_metric, False, "Some metric"), # do not transform the name + ("Some metric", neg_metric, True, "Some metric"), # do not transform the name + (None, neg_metric, False, "Negative metric"), # default name + (None, neg_metric, True, "Metric"), # default name but negated + ], +) +def test_validate_score_name(score_name, scoring, negate_score, expected_score_name): + """Check that we return the right score name.""" + assert ( + _validate_score_name(score_name, scoring, negate_score) == expected_score_name + ) + + +# In the following test, we check the value of the max to min ratio +# for parameter value intervals to check that using a decision threshold +# of 5. is a good heuristic to decide between linear and log scales on +# common ranges of parameter values. +@pytest.mark.parametrize( + "data, lower_bound, upper_bound", + [ + # Such a range could be clearly displayed with either log scale or linear + # scale. + (np.geomspace(0.1, 1, 5), 5, 6), + # Checking that the ratio is still positive on a negative log scale. + (-np.geomspace(0.1, 1, 10), 7, 8), + # Evenly spaced parameter values lead to a ratio of 1. + (np.linspace(0, 1, 5), 0.9, 1.1), + # This is not exactly spaced on a log scale but we will benefit from treating + # it as such for visualization. 
+ ([1, 2, 5, 10, 20, 50], 20, 40), + ], +) +def test_inverval_max_min_ratio(data, lower_bound, upper_bound): + assert lower_bound < _interval_max_min_ratio(data) < upper_bound diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 4a765d1404794..2d39279f81745 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -42,6 +42,7 @@ from sklearn.utils import _safe_indexing from sklearn.utils.validation import ( has_fit_parameter, + _is_fitted, check_is_fitted, check_consistent_length, assert_all_finite, @@ -848,23 +849,32 @@ def fit(self, X, y): msg = "not fitted" est = MyEstimator() + assert not _is_fitted(est, attributes=["a_", "b_"]) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"]) + assert not _is_fitted(est, attributes=["a_", "b_"], all_or_any=all) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + assert not _is_fitted(est, attributes=["a_", "b_"], all_or_any=any) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) est.a_ = "a" + assert not _is_fitted(est, attributes=["a_", "b_"]) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"]) + assert not _is_fitted(est, attributes=["a_", "b_"], all_or_any=all) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + assert _is_fitted(est, attributes=["a_", "b_"], all_or_any=any) check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) est.b_ = "b" + assert _is_fitted(est, attributes=["a_", "b_"]) check_is_fitted(est, attributes=["a_", "b_"]) + assert _is_fitted(est, attributes=["a_", "b_"], all_or_any=all) check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + assert _is_fitted(est, attributes=["a_", "b_"], all_or_any=any) check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 6179d91c2a491..8ceef15986567 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1369,6 +1369,44 @@ def check_symmetric(array, *, tol=1e-10, raise_warning=True, raise_exception=Fal return array +def _is_fitted(estimator, attributes=None, all_or_any=all): + """Determine if an estimator is fitted + + Parameters + ---------- + estimator : estimator instance + Estimator instance for which the check is performed. + + attributes : str, list or tuple of str, default=None + Attribute name(s) given as string or a list/tuple of strings + Eg.: ``["coef_", "estimator_", ...], "coef_"`` + + If `None`, `estimator` is considered fitted if there exist an + attribute that ends with a underscore and does not start with double + underscore. + + all_or_any : callable, {all, any}, default=all + Specify whether all or any of the given attributes must exist. + + Returns + ------- + fitted : bool + Whether the estimator is fitted. 
+ """ + if attributes is not None: + if not isinstance(attributes, (list, tuple)): + attributes = [attributes] + return all_or_any([hasattr(estimator, attr) for attr in attributes]) + + if hasattr(estimator, "__sklearn_is_fitted__"): + return estimator.__sklearn_is_fitted__() + + fitted_attrs = [ + v for v in vars(estimator) if v.endswith("_") and not v.startswith("__") + ] + return len(fitted_attrs) > 0 + + def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. @@ -1425,18 +1463,7 @@ def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): if not hasattr(estimator, "fit"): raise TypeError("%s is not an estimator instance." % (estimator)) - if attributes is not None: - if not isinstance(attributes, (list, tuple)): - attributes = [attributes] - fitted = all_or_any([hasattr(estimator, attr) for attr in attributes]) - elif hasattr(estimator, "__sklearn_is_fitted__"): - fitted = estimator.__sklearn_is_fitted__() - else: - fitted = [ - v for v in vars(estimator) if v.endswith("_") and not v.startswith("__") - ] - - if not fitted: + if not _is_fitted(estimator, attributes, all_or_any): raise NotFittedError(msg % {"name": type(estimator).__name__}) From 855ee192407d19b51adb4f50a49c6752ee80c820 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 15 Jun 2023 20:32:20 -0400 Subject: [PATCH 15/39] Add quantile Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- sklearn/tree/_classes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index e715952947c04..b43bbeaf0b435 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -730,7 +730,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): The quantiles at which to evaluate, by default 0.5 (median). method : str, optional The method to interpolate, by default 'linear'. Can be any keyword - argument accepted by :func:`np.quantile`. + argument accepted by :func:`~np.quantile`. check_input : bool, optional Whether or not to check input, by default True. diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 64a444db0b228..d7d8cedb63696 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -641,7 +641,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True The quantiles at which to evaluate, by default 0.5 (median). method : str, optional The method to interpolate, by default 'linear'. Can be any keyword - argument accepted by :func:`np.quantile`. + argument accepted by :func:`~np.quantile`. check_input : bool, optional Whether or not to check input, by default True. From 3f5cb6597e36a08f651f8f0eb7324e9658a14bea Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Jun 2023 11:05:43 -0400 Subject: [PATCH 16/39] Add check input Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 -- sklearn/tree/_classes.py | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b43bbeaf0b435..c51c489dbd5dd 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -731,8 +731,6 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): method : str, optional The method to interpolate, by default 'linear'. Can be any keyword argument accepted by :func:`~np.quantile`. - check_input : bool, optional - Whether or not to check input, by default True. 
Returns ------- diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index d7d8cedb63696..78454b8854d26 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -602,6 +602,8 @@ def get_leaf_node_samples(self, X, check_input=True): ---------- X : array-like of shape (n_samples, n_features) Dataset to apply the forest to. + check_input : bool, default=True + Allow to bypass several input checking. Returns ------- From 7401ddcb19a42132cf46e79a14b22a2bdfb8519c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Jun 2023 18:35:39 -0400 Subject: [PATCH 17/39] Try to fix docstring Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 78454b8854d26..c75c933c49b39 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -607,12 +607,11 @@ def get_leaf_node_samples(self, X, check_input=True): Returns ------- - leaf_nodes_samples : a list of array-like of shape - (n_leaf_node_samples, n_outputs) + leaf_nodes_samples : a list of array-like Each sample is represented by the indices of the training samples that reached the leaf node. The ``n_leaf_node_samples`` may vary between samples, since the number of samples that fall in a leaf node is - variable. + variable. Each array has shape (n_leaf_node_samples, n_outputs). """ if not self.store_leaf_values: raise RuntimeError( From 13e29135bd0b640f3bf325ec40a22a879096b719 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Jun 2023 18:41:17 -0400 Subject: [PATCH 18/39] Try to fix docstring Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index c75c933c49b39..2d83a94dc8ec1 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1167,7 +1167,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): self : DecisionTreeClassifier Fitted estimator. """ - super()._fit( X, y, From 43aa3ef51ca96b58b00a178954d033579db09de9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 17 Jun 2023 10:41:44 -0400 Subject: [PATCH 19/39] Fix docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index c51c489dbd5dd..5482ebcaf1d41 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -817,12 +817,11 @@ def get_leaf_node_samples(self, X): Returns ------- - leaf_node_samples : a list of array-like of shape - (n_leaf_node_samples, n_outputs) + leaf_node_samples : a list of array-like Each sample is represented by the indices of the training samples that reached the leaf node. The ``n_leaf_node_samples`` may vary between samples, since the number of samples that fall in a leaf node is - variable. + variable. Each array-like has shape (n_leaf_node_samples, n_outputs). 
""" check_is_fitted(self) # Check data From fe3072f4ee28f49d590e7b437bf01bffd61ab917 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 17 Jun 2023 11:01:09 -0400 Subject: [PATCH 20/39] Fix docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 5482ebcaf1d41..9fd3af21b1fd9 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -696,7 +696,6 @@ def _bin_data(self, X, is_training_data): If is_training_data, then fit the _bin_mapper attribute. Else, the binned data is converted to a C-contiguous array. """ - description = "training" if is_training_data else "validation" if self.verbose: print( From 2d4de9aff7567bf796626aed4f27149f6ccf399c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 19 Jun 2023 21:33:55 -0400 Subject: [PATCH 21/39] Fix the predict quantiles docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 9fd3af21b1fd9..f85efb0b0a43b 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -733,9 +733,9 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): Returns ------- - y : ndarray of shape (n_samples, n_quantiles) or - (n_samples, n_quantiles, n_outputs) - The predicted values. + y : ndarray of shape (n_samples, n_quantiles, [n_output]) + The predicted values. The ``n_outputs`` dimension is present only + for multi-output regressors. """ if not self.store_leaf_values: raise RuntimeError( From 1c1ec8cff3a181b7a86a4df8a2aeb01fa7cdbe6a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 19 Jun 2023 21:35:33 -0400 Subject: [PATCH 22/39] Fix the predict quantiles docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f85efb0b0a43b..3eb61c9497918 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -733,7 +733,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): Returns ------- - y : ndarray of shape (n_samples, n_quantiles, [n_output]) + y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) The predicted values. The ``n_outputs`` dimension is present only for multi-output regressors. 
""" From 4bc651dd7916d7c267690ef0c9705b3f2d69c9d0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Jun 2023 12:02:45 -0400 Subject: [PATCH 23/39] Remove some diff Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 1 - sklearn/tree/_criterion.pyx | 18 ++++++++++++++++++ sklearn/tree/_tree.pxd | 3 ++- sklearn/tree/_tree.pyx | 2 -- sklearn/tree/tests/test_tree.py | 8 +++++--- 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 74e60c64ce85f..e61f674d300c9 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -511,7 +511,6 @@ def _build_tree( self.min_impurity_decrease, self.store_leaf_values, ) - builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 178a9adee9e80..2ddc02194c490 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -155,8 +155,10 @@ cdef class BaseCriterion: This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: + N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) + where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, @@ -165,8 +167,10 @@ cdef class BaseCriterion: ---------- impurity_parent : double The initial impurity of the parent node before the split + impurity_left : double The impurity of the left child + impurity_right : double The impurity of the right child @@ -611,10 +615,13 @@ cdef class Entropy(ClassificationCriterion): This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let + count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) + be the proportion of class k observations in node m. The cross-entropy is then defined as + cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) """ @@ -1058,10 +1065,14 @@ cdef class MSE(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. + The MSE proxy is derived from + sum_{i left}(y_i - y_pred_L)^2 + sum_{i right}(y_i - y_pred_R)^2 = sum(y_i^2) - n_L * mean_{i left}(y_i)^2 - n_R * mean_{i right}(y_i)^2 + Neglecting constant terms, this gives: + - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2 """ cdef SIZE_t k @@ -1139,6 +1150,7 @@ cdef class MAE(RegressionCriterion): ---------- n_outputs : SIZE_t The number of targets to be predicted + n_samples : SIZE_t The total number of samples to fit on """ @@ -1429,6 +1441,7 @@ cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman. Uses the formula (35) in Friedman's original Gradient Boosting paper: + diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) """ @@ -1483,6 +1496,7 @@ cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) + Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. 
It remains the implemented impurity (factor 2 is skipped): @@ -1519,12 +1533,16 @@ cdef class Poisson(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. + The Poisson proxy is derived from: + sum_{i left }(y_i * log(y_i / y_pred_L)) + sum_{i right}(y_i * log(y_i / y_pred_R)) = sum(y_i * log(y_i) - n_L * mean_{i left}(y_i) * log(mean_{i left}(y_i)) - n_R * mean_{i right}(y_i) * log(mean_{i right}(y_i)) + Neglecting constant terms, this gives + - sum{i left }(y_i) * log(mean{i left}(y_i)) - sum{i right}(y_i) * log(mean{i right}(y_i)) """ diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 7b933d905c79a..dedd820c41e0f 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -141,7 +141,8 @@ cdef class TreeBuilder: # This class controls the various stopping criteria and the node splitting # evaluation order, e.g. depth-first or best-first. - cdef Splitter splitter + cdef Splitter splitter # Splitting algorithm + cdef SIZE_t min_samples_split # Minimum number of samples in an internal node cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf cdef double min_weight_leaf # Minimum weight in a leaf diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 24b01b96aa726..c44022f54d3a5 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -61,7 +61,6 @@ cdef extern from "" namespace "std" nogil: from numpy import float32 as DTYPE from numpy import float64 as DOUBLE - cdef double INFINITY = np.inf cdef double EPSILON = np.finfo('double').eps @@ -87,7 +86,6 @@ NODE_DTYPE = np.asarray((&dummy)).dtype # TreeBuilder # ============================================================================= - cdef class TreeBuilder: """Interface for different tree building strategies.""" diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 792ba44b1302e..9be3dbd6f549e 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -33,13 +33,15 @@ DENSE_SPLITTERS, SPARSE_SPLITTERS, ) -from sklearn.tree._tree import NODE_DTYPE, TREE_LEAF, TREE_UNDEFINED -from sklearn.tree._tree import Tree as CythonTree from sklearn.tree._tree import ( + NODE_DTYPE, + TREE_LEAF, + TREE_UNDEFINED, _check_n_classes, _check_node_ndarray, _check_value_ndarray, ) +from sklearn.tree._tree import Tree as CythonTree from sklearn.utils import _IS_32BIT, compute_sample_weight from sklearn.utils._testing import ( assert_almost_equal, @@ -2424,7 +2426,7 @@ def test_missing_values_on_equal_nodes_no_missing(criterion): X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) - dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion) + dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True) dtc.fit(X, y) # Goes to right node because it has the most data points From cc035d04b9784e6facb7096a56c9c81801d819ec Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Jun 2023 15:42:08 -0400 Subject: [PATCH 24/39] Fix regression error Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 18 +++++++++--------- sklearn/tree/_criterion.pyx | 3 ++- sklearn/tree/_splitter.pyx | 6 ++++++ sklearn/tree/tests/test_tree.py | 4 +++- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index d8a94940799c0..f2e0201d534cd 100644 --- a/sklearn/ensemble/_forest.py +++ 
b/sklearn/ensemble/_forest.py @@ -40,27 +40,28 @@ class calls the ``fit`` method of each sub-estimator on random samples # License: BSD 3 clause -from time import time import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real +from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np from scipy.sparse import hstack as sparse_hstack from scipy.sparse import issparse -from sklearn.base import is_classifier, _fit_context from sklearn.base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, TransformerMixin, + _fit_context, + is_classifier, ) - -from sklearn.metrics import accuracy_score, r2_score -from sklearn.preprocessing import OneHotEncoder +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import accuracy_score, r2_score from sklearn.preprocessing import OneHotEncoder from sklearn.tree import ( BaseDecisionTree, @@ -69,8 +70,8 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeClassifier, ExtraTreeRegressor, ) -from ..tree._tree import DOUBLE, DTYPE from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions from sklearn.utils.multiclass import check_classification_targets, type_of_target from sklearn.utils.parallel import Parallel, delayed @@ -80,9 +81,8 @@ class calls the ``fit`` method of each sub-estimator on random samples _num_samples, check_is_fitted, ) -from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.utils._openmp_helpers import _openmp_effective_n_threads -from sklearn.ensemble._base import BaseEnsemble, _partition_estimators + +from ..tree._tree import DOUBLE, DTYPE __all__ = [ "RandomForestClassifier", diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 2ddc02194c490..bd1bdef0a6a93 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1496,10 +1496,11 @@ cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) - + Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. 
It remains the implemented impurity (factor 2 is skipped): + 1/n * sum(y_true * log(y_true/y_pred) """ # FIXME in 1.0: diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 007d55a589df7..bca38d5f04374 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -507,6 +507,12 @@ cdef inline int node_split_best( current_split.pos = p # Reject if min_samples_leaf is not guaranteed + if missing_go_to_left: + n_left = current_split.pos - splitter.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - splitter.start + n_right = end_non_missing - current_split.pos + n_missing if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 9be3dbd6f549e..0ce7a548c7bdb 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2426,7 +2426,9 @@ def test_missing_values_on_equal_nodes_no_missing(criterion): X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) - dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True) + dtc = DecisionTreeRegressor( + random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True + ) dtc.fit(X, y) # Goes to right node because it has the most data points From 4840d4e3e3ef6175c4e1197c87c77f8fe06f10cf Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Jun 2023 18:26:04 -0400 Subject: [PATCH 25/39] Fix boolean Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- sklearn/tree/_classes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f2e0201d534cd..b3feec10a3072 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -221,7 +221,7 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): None, Interval(Integral, 1, None, closed="left"), ], - "store_leaf_values": [bool], + "store_leaf_values": ["boolean"], } @abstractmethod diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index e61f674d300c9..6825c36df155c 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -123,7 +123,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], - "store_leaf_values": [bool], + "store_leaf_values": ["boolean"], } @abstractmethod From fdf2e2dbe1e1c316a1e2987aea31da26ebbec2cd Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 30 Jun 2023 12:49:16 -0700 Subject: [PATCH 26/39] Added doc to store_leaf_values Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b3feec10a3072..34bebab399566 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -822,6 +822,11 @@ def get_leaf_node_samples(self, X): samples, since the number of samples that fall in a leaf node is variable. Each array-like has shape (n_leaf_node_samples, n_outputs). 
""" + if not self.store_leaf_values: + raise RuntimeError( + "Leaf node samples are not available when store_leaf_values=False" + ) + check_is_fitted(self) # Check data X = self._validate_X_predict(X) @@ -1520,6 +1525,9 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` @@ -1879,6 +1887,9 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeRegressor` @@ -2232,6 +2243,9 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreesClassifier` @@ -2576,6 +2590,9 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` From 5b7ce7e1c6842aac174ebc4b1b2a68a1f1e25a7d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 30 Jun 2023 12:51:20 -0700 Subject: [PATCH 27/39] Merging main Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 6825c36df155c..200f87b0b9ef3 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1386,6 +1386,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- feature_importances_ : ndarray of shape (n_features,) @@ -1713,6 +1723,16 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -1959,6 +1979,16 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. 
+ + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- max_features_ : int From 9655d013870e3007d5c5a1898212a9d0eeea0968 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 30 Jun 2023 13:03:26 -0700 Subject: [PATCH 28/39] Fix now Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 34bebab399566..768eeeaf1959f 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -63,13 +63,6 @@ class calls the ``fit`` method of each sub-estimator on random samples from sklearn.exceptions import DataConversionWarning from sklearn.metrics import accuracy_score, r2_score from sklearn.preprocessing import OneHotEncoder -from sklearn.tree import ( - BaseDecisionTree, - DecisionTreeClassifier, - DecisionTreeRegressor, - ExtraTreeClassifier, - ExtraTreeRegressor, -) from sklearn.utils import check_random_state, compute_sample_weight from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions @@ -82,6 +75,13 @@ class calls the ``fit`` method of each sub-estimator on random samples check_is_fitted, ) +from ..tree import ( + BaseDecisionTree, + DecisionTreeClassifier, + DecisionTreeRegressor, + ExtraTreeClassifier, + ExtraTreeRegressor, +) from ..tree._tree import DOUBLE, DTYPE __all__ = [ From 6b57c5819782afec9ff5ac97e2c662bc8a66506d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 5 Jul 2023 10:52:29 -0400 Subject: [PATCH 29/39] Bring in monotonicity (#47) #### Reference Issues/PRs #### What does this implement/fix? Explain your changes. #### Any other comments? Signed-off-by: Adam Li --- doc/whats_new/v1.4.rst | 32 ++ sklearn/ensemble/_forest.py | 97 +++- sklearn/ensemble/_gb.py | 1 + sklearn/feature_selection/_mutual_info.py | 5 +- .../tests/test_mutual_info.py | 15 + sklearn/pipeline.py | 20 +- sklearn/preprocessing/_data.py | 5 +- sklearn/tree/_classes.py | 128 ++++- sklearn/tree/_criterion.pxd | 22 + sklearn/tree/_criterion.pyx | 125 +++++ sklearn/tree/_splitter.pxd | 22 +- sklearn/tree/_splitter.pyx | 133 ++++- sklearn/tree/_tree.pyx | 192 ++++++- sklearn/tree/tests/test_monotonic_tree.py | 491 ++++++++++++++++++ sklearn/tree/tests/test_tree.py | 2 +- 15 files changed, 1231 insertions(+), 59 deletions(-) create mode 100644 sklearn/tree/tests/test_monotonic_tree.py diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 54aacb3988e81..6a5660ee27b2e 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -59,6 +59,31 @@ TODO: update at the time of the release. passed to the ``fit`` method of the the estimator. :pr:`26506` by `Adrin Jalali`_. + +:mod:`sklearn.ensemble` +....................... + +- |Feature| :class:`ensemble.RandomForestClassifier`, + :class:`ensemble.RandomForestRegressor`, :class:`ensemble.ExtraTreesClassifier` + and :class:`ensemble.ExtraTreesRegressor` now support monotonic constraints, + useful when features are supposed to have a positive/negative effect on the target. + Missing values in the train data and multi-output targets are not supported. + :pr:`13649` by :user:`Samuel Ronsin `, + initiated by :user:`Patrick O'Reilly `. + + +:mod:`sklearn.tree` +................... 
+ +- |Feature| :class:`tree.DecisionTreeClassifier`, :class:`tree.DecisionTreeRegressor`, + :class:`tree.ExtraTreeClassifier` and :class:`tree.ExtraTreeRegressor` now support + monotonic constraints, useful when features are supposed to have a positive/negative + effect on the target. Missing values in the train data and multi-output targets are + not supported. + :pr:`13649` by :user:`Samuel Ronsin `, initiated by + :user:`Patrick O'Reilly `. + + :mod:`sklearn.decomposition` ............................ @@ -68,3 +93,10 @@ TODO: update at the time of the release. when using a custom initialization. The default value of this parameter will change from `None` to `auto` in version 1.6. :pr:`26634` by :user:`Alexandre Landeau ` and :user:`Alexandre Vigny `. + + +:mod:`sklearn.feature_selection` +................................ + +- |Fix| :func:`feature_selection.mutual_info_regression` now correctly computes the + result when `X` is of integer dtype. :pr:`26748` by :user:`Yao Xiao `. diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 768eeeaf1959f..47aa995ee51f3 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1525,9 +1525,31 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multiclass classifications (i.e. when `n_classes > 2`), + - multioutput classifications (i.e. when `n_outputs_ > 1`), + - classifications trained on data with missing values. + + The constraints hold over the probability of the positive class. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` @@ -1670,6 +1692,7 @@ def __init__( max_samples=None, max_bins=None, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( estimator=DecisionTreeClassifier(), @@ -1686,6 +1709,7 @@ def __init__( "random_state", "ccp_alpha", "store_leaf_values", + "monotonic_cst", ), bootstrap=bootstrap, oob_score=oob_score, @@ -1707,6 +1731,7 @@ def __init__( self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease + self.monotonic_cst = monotonic_cst self.ccp_alpha = ccp_alpha @@ -1887,9 +1912,29 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. Used for + speeding up training time. + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonically increasing + - 0: no constraint + - -1: monotonically decreasing + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multioutput regressions (i.e. when `n_outputs_ > 1`), + - regressions trained on data with missing values. 
+ + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeRegressor` @@ -2019,6 +2064,7 @@ def __init__( max_samples=None, max_bins=None, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( estimator=DecisionTreeRegressor(), @@ -2035,6 +2081,7 @@ def __init__( "random_state", "ccp_alpha", "store_leaf_values", + "monotonic_cst", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2056,6 +2103,7 @@ def __init__( self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease self.ccp_alpha = ccp_alpha + self.monotonic_cst = monotonic_cst class ExtraTreesClassifier(ForestClassifier): @@ -2242,10 +2290,32 @@ class ExtraTreesClassifier(ForestClassifier): `max_samples` should be in the interval `(0.0, 1.0]`. .. versionadded:: 0.22 + + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonically increasing + - 0: no constraint + - -1: monotonically decreasing + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multiclass classifications (i.e. when `n_classes > 2`), + - multioutput classifications (i.e. when `n_outputs_ > 1`), + - classifications trained on data with missing values. + + The constraints hold over the probability of the positive class. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreesClassifier` @@ -2377,6 +2447,7 @@ def __init__( max_samples=None, max_bins=None, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( estimator=ExtraTreeClassifier(), @@ -2393,6 +2464,7 @@ def __init__( "random_state", "ccp_alpha", "store_leaf_values", + "monotonic_cst", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2415,6 +2487,7 @@ def __init__( self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease self.ccp_alpha = ccp_alpha + self.monotonic_cst = monotonic_cst class ExtraTreesRegressor(ForestRegressor): @@ -2590,9 +2663,28 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonically increasing + - 0: no constraint + - -1: monotonically decreasing + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multioutput regressions (i.e. when `n_outputs_ > 1`), + - regressions trained on data with missing values. + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 1.4 + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` @@ -2707,6 +2799,7 @@ def __init__( max_samples=None, max_bins=None, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2723,6 +2816,7 @@ def __init__( "random_state", "ccp_alpha", "store_leaf_values", + "monotonic_cst", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2744,6 +2838,7 @@ def __init__( self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease self.ccp_alpha = ccp_alpha + self.monotonic_cst = monotonic_cst class RandomTreesEmbedding(TransformerMixin, BaseForest): @@ -2937,7 +3032,7 @@ class RandomTreesEmbedding(TransformerMixin, BaseForest): **BaseDecisionTree._parameter_constraints, "sparse_output": ["boolean"], } - for param in ("max_features", "ccp_alpha", "splitter"): + for param in ("max_features", "ccp_alpha", "splitter", "monotonic_cst"): _parameter_constraints.pop(param) criterion = "squared_error" diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 21acb6bfe7693..3a14da52047ad 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -138,6 +138,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): } _parameter_constraints.pop("store_leaf_values") _parameter_constraints.pop("splitter") + _parameter_constraints.pop("monotonic_cst") @abstractmethod def __init__( diff --git a/sklearn/feature_selection/_mutual_info.py b/sklearn/feature_selection/_mutual_info.py index 78425ad6299d5..bd62495ac28a3 100644 --- a/sklearn/feature_selection/_mutual_info.py +++ b/sklearn/feature_selection/_mutual_info.py @@ -280,15 +280,12 @@ def _estimate_mi( rng = check_random_state(random_state) if np.any(continuous_mask): - if copy: - X = X.copy() - + X = X.astype(np.float64, copy=copy) X[:, continuous_mask] = scale( X[:, continuous_mask], with_mean=False, copy=False ) # Add small noise to continuous features as advised in Kraskov et. al. - X = X.astype(np.float64, copy=False) means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) X[:, continuous_mask] += ( 1e-10 diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py index f7b4af0a393f9..349147f66e36c 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -236,3 +236,18 @@ def test_mutual_information_symmetry_classif_regression(correlated, global_rando ) assert mi_classif == pytest.approx(mi_regression) + + +def test_mutual_info_regression_X_int_dtype(global_random_seed): + """Check that results agree when X is integer dtype and float dtype. + + Non-regression test for Issue #26696. 
+ """ + rng = np.random.RandomState(global_random_seed) + X = rng.randint(100, size=(100, 10)) + X_float = X.astype(np.float64, copy=True) + y = rng.randint(100, size=100) + + expected = mutual_info_regression(X_float, y, random_state=global_random_seed) + result = mutual_info_regression(X, y, random_state=global_random_seed) + assert_allclose(result, expected) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 6fffd93c2a64c..26008c82fef11 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -131,10 +131,11 @@ class Pipeline(_BaseComposition): >>> pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())]) >>> # The pipeline can be used as any other estimator >>> # and avoids leaking the test set into the train set - >>> pipe.fit(X_train, y_train) - Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())]) - >>> pipe.score(X_test, y_test) + >>> pipe.fit(X_train, y_train).score(X_test, y_test) 0.88 + >>> # An estimator's parameter can be set using '__' syntax + >>> pipe.set_params(svc__C=10).fit(X_train, y_train).score(X_test, y_test) + 0.76 """ # BaseEstimator interface @@ -1051,6 +1052,10 @@ class FeatureUnion(TransformerMixin, _BaseComposition): >>> union.fit_transform(X) array([[ 1.5 , 3.0..., 0.8...], [-1.5 , 5.7..., -0.4...]]) + >>> # An estimator's parameter can be set using '__' syntax + >>> union.set_params(pca__n_components=1).fit_transform(X) + array([[ 1.5 , 3.0...], + [-1.5 , 5.7...]]) """ _required_parameters = ["transformer_list"] @@ -1362,11 +1367,12 @@ def __getitem__(self, name): def make_union(*transformers, n_jobs=None, verbose=False): - """Construct a FeatureUnion from the given transformers. + """Construct a :class:`FeatureUnion` from the given transformers. - This is a shorthand for the FeatureUnion constructor; it does not require, - and does not permit, naming the transformers. Instead, they will be given - names automatically based on their types. It also does not allow weighting. + This is a shorthand for the :class:`FeatureUnion` constructor; it does not + require, and does not permit, naming the transformers. Instead, they will + be given names automatically based on their types. It also does not allow + weighting. Parameters ---------- diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 56ae17f312fa2..197d709689daa 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -721,11 +721,12 @@ class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): mean_ : ndarray of shape (n_features,) or None The mean value for each feature in the training set. - Equal to ``None`` when ``with_mean=False``. + Equal to ``None`` when ``with_mean=False`` and ``with_std=False``. var_ : ndarray of shape (n_features,) or None The variance for each feature in the training set. Used to compute - `scale_`. Equal to ``None`` when ``with_std=False``. + `scale_`. Equal to ``None`` when ``with_mean=False`` and + ``with_std=False``. n_features_in_ : int Number of features seen during :term:`fit`. 
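The `monotonic_cst` option documented in the forest docstrings above can be exercised as in the following minimal sketch. This is illustrative only: the estimator and the `monotonic_cst` parameter come from this patch (mirroring the upstream scikit-learn 1.4 feature it ports), while the synthetic data and variable names are made up for the example.

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(300, 2))
    # Target increases with feature 0; feature 1 only adds wiggle.
    y = 5 * X[:, 0] + np.sin(6 * X[:, 1]) + rng.normal(scale=0.1, size=300)

    # Feature 0: monotonic increase (+1); feature 1: unconstrained (0).
    forest = RandomForestRegressor(
        n_estimators=50, monotonic_cst=[1, 0], random_state=0
    ).fit(X, y)

    # With the constraint, predictions cannot decrease along feature 0
    # when feature 1 is held fixed.
    grid = np.column_stack([np.linspace(0, 1, 100), np.full(100, 0.5)])
    assert np.all(np.diff(forest.predict(grid)) >= 0)

A value outside {-1, 0, 1}, a multi-output `y`, or (for classifiers) more than two classes is rejected with a `ValueError` by the `_build_tree` validation added in the tree diff below.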
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 200f87b0b9ef3..1b718f3a04052 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -124,6 +124,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], "store_leaf_values": ["boolean"], + "monotonic_cst": ["array-like", None], } @abstractmethod @@ -143,6 +144,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, store_leaf_values=False, + monotonic_cst=None, ): self.criterion = criterion self.splitter = splitter @@ -157,6 +159,7 @@ def __init__( self.class_weight = class_weight self.ccp_alpha = ccp_alpha self.store_leaf_values = store_leaf_values + self.monotonic_cst = monotonic_cst def get_depth(self): """Return the depth of the decision tree. @@ -184,7 +187,11 @@ def get_n_leaves(self): return self.tree_.n_leaves def _support_missing_values(self, X): - return not issparse(X) and self._get_tags()["allow_nan"] + return ( + not issparse(X) + and self._get_tags()["allow_nan"] + and self.monotonic_cst is None + ) def _compute_missing_values_in_feature_mask(self, X): """Return boolean mask denoting if there are missing values for each feature. @@ -469,7 +476,45 @@ def _build_tree( SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS - splitter = self.splitter + if self.monotonic_cst is None: + monotonic_cst = None + else: + if self.n_outputs_ > 1: + raise ValueError( + "Monotonicity constraints are not supported with multiple outputs." + ) + # Check to correct monotonicity constraint' specification, + # by applying element-wise logical conjunction + # Note: we do not cast `np.asarray(self.monotonic_cst, dtype=np.int8)` + # straight away here so as to generate error messages for invalid + # values using the original values prior to any dtype related conversion. + monotonic_cst = np.asarray(self.monotonic_cst) + if monotonic_cst.shape[0] != X.shape[1]: + raise ValueError( + "monotonic_cst has shape {} but the input data " + "X has {} features.".format(monotonic_cst.shape[0], X.shape[1]) + ) + valid_constraints = np.isin(monotonic_cst, (-1, 0, 1)) + if not np.all(valid_constraints): + unique_constaints_value = np.unique(monotonic_cst) + raise ValueError( + "monotonic_cst must be None or an array-like of -1, 0 or 1, but" + f" got {unique_constaints_value}" + ) + monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8) + if is_classifier(self): + if self.n_classes_[0] > 2: + raise ValueError( + "Monotonicity constraints are not supported with multiclass " + "classification" + ) + # Binary classification trees are built by constraining probabilities + # of the *negative class* in order to make the implementation similar + # to regression trees. + # Since self.monotonic_cst encodes constraints on probabilities of the + # *positive class*, all signs must be flipped. + monotonic_cst *= -1 + if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( criterion, @@ -477,6 +522,7 @@ def _build_tree( min_samples_leaf, min_weight_leaf, random_state, + monotonic_cst, ) if is_classifier(self): @@ -1003,6 +1049,25 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): One could instead store the indices in ``y_train`` that fall into each leaf, which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. 
+ - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multiclass classifications (i.e. when `n_classes > 2`), + - multioutput classifications (i.e. when `n_outputs_ > 1`), + - classifications trained on data with missing values. + + The constraints hold over the probability of the positive class. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -1121,6 +1186,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( criterion=criterion, @@ -1134,6 +1200,7 @@ def __init__( class_weight=class_weight, random_state=random_state, min_impurity_decrease=min_impurity_decrease, + monotonic_cst=monotonic_cst, ccp_alpha=ccp_alpha, store_leaf_values=store_leaf_values, ) @@ -1396,6 +1463,22 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): One could instead store the indices in ``y_train`` that fall into each leaf, which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multioutput regressions (i.e. when `n_outputs_ > 1`), + - regressions trained on data with missing values. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- feature_importances_ : ndarray of shape (n_features,) @@ -1495,6 +1578,7 @@ def __init__( min_impurity_decrease=0.0, ccp_alpha=0.0, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( criterion=criterion, @@ -1509,6 +1593,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, store_leaf_values=store_leaf_values, + monotonic_cst=monotonic_cst, ) @_fit_context(prefer_skip_nested_validation=True) @@ -1733,6 +1818,25 @@ class ExtraTreeClassifier(DecisionTreeClassifier): One could instead store the indices in ``y_train`` that fall into each leaf, which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multiclass classifications (i.e. when `n_classes > 2`), + - multioutput classifications (i.e. when `n_outputs_ > 1`), + - classifications trained on data with missing values. + + The constraints hold over the probability of the positive class. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -1834,6 +1938,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( criterion=criterion, @@ -1849,6 +1954,7 @@ def __init__( random_state=random_state, ccp_alpha=ccp_alpha, store_leaf_values=store_leaf_values, + monotonic_cst=monotonic_cst, ) @@ -1989,6 +2095,22 @@ class ExtraTreeRegressor(DecisionTreeRegressor): One could instead store the indices in ``y_train`` that fall into each leaf, which would lower RAM/diskspace usage. 
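A usage sketch of the `monotonic_cst` parameter documented above, mirroring the behaviour exercised by the new tests later in this patch (data and estimator settings are illustrative)::

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.RandomState(0)
    X = rng.rand(200, 2)
    y = (X[:, 0] + 0.1 * rng.randn(200) > 0.5).astype(int)

    # +1 constrains the predicted probability of the positive class to be
    # non-decreasing in feature 0; feature 1 is left unconstrained.
    clf = DecisionTreeClassifier(monotonic_cst=[1, 0], random_state=0).fit(X, y)

    X_shifted = X.copy()
    X_shifted[:, 0] += 10.0
    assert np.all(clf.predict_proba(X_shifted)[:, 1] >= clf.predict_proba(X)[:, 1])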
+ monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multioutput regressions (i.e. when `n_outputs_ > 1`), + - regressions trained on data with missing values. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- max_features_ : int @@ -2073,6 +2195,7 @@ def __init__( max_leaf_nodes=None, ccp_alpha=0.0, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( criterion=criterion, @@ -2087,4 +2210,5 @@ def __init__( random_state=random_state, ccp_alpha=ccp_alpha, store_leaf_values=store_leaf_values, + monotonic_cst=monotonic_cst, ) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index ecbf56e5f6016..f972cf2afc932 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -10,6 +10,7 @@ # License: BSD 3 clause # See _criterion.pyx for implementation details. +cimport numpy as cnp from libcpp.vector cimport vector @@ -58,6 +59,13 @@ cdef class BaseCriterion: self, double* dest ) noexcept nogil + cdef void clip_node_value( + self, + double* dest, + double lower_bound, + double upper_bound + ) noexcept nogil + cdef double middle_value(self) noexcept nogil cdef double impurity_improvement( self, double impurity_parent, @@ -65,6 +73,20 @@ cdef class BaseCriterion: double impurity_right ) noexcept nogil cdef double proxy_impurity_improvement(self) noexcept nogil + cdef bint check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + ) noexcept nogil + cdef inline bint _check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + double sum_left, + double sum_right, + ) noexcept nogil cdef void set_sample_pointers( self, diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index bd1bdef0a6a93..41ead9fdb70e2 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -130,6 +130,34 @@ cdef class BaseCriterion: """ pass + cdef void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: + pass + + cdef double middle_value(self) noexcept nogil: + """Compute the middle value of a split for monotonicity constraints + + This method is implemented in ClassificationCriterion and RegressionCriterion. + """ + pass + + cdef bint check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + ) noexcept nogil: + pass + + cdef inline bint _check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + double value_left, + double value_right, + ) noexcept nogil: + pass + cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. @@ -608,6 +636,47 @@ cdef class ClassificationCriterion(Criterion): memcpy(dest, &self.sum_total[k, 0], self.n_classes[k] * sizeof(double)) dest += self.max_n_classes + cdef void clip_node_value(self, double * dest, double lower_bound, double upper_bound) noexcept nogil: + """Clip the value in dest between lower_bound and upper_bound for monotonic constraints. + + Note that monotonicity constraints are only supported for: + - single-output trees and + - binary classifications. 
+ """ + if dest[0] < lower_bound: + dest[0] = lower_bound + elif dest[0] > upper_bound: + dest[0] = upper_bound + + # Class proportions for binary classification must sum to 1. + dest[1] = 1 - dest[0] + + cdef inline double middle_value(self) noexcept nogil: + """Compute the middle value of a split for monotonicity constraints as the simple average + of the left and right children values. + + Note that monotonicity constraints are only supported for: + - single-output trees and + - binary classifications. + """ + return ( + (self.sum_left[0, 0] / (2 * self.weighted_n_left)) + + (self.sum_right[0, 0] / (2 * self.weighted_n_right)) + ) + + cdef inline bint check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + ) noexcept nogil: + """Check monotonicity constraint is satisfied at the current classification split""" + cdef: + double value_left = self.sum_left[0][0] / self.weighted_n_left + double value_right = self.sum_right[0][0] / self.weighted_n_right + + return self._check_monotonicity(monotonic_cst, lower_bound, upper_bound, value_left, value_right) + cdef class Entropy(ClassificationCriterion): r"""Cross Entropy impurity criterion. @@ -1033,6 +1102,37 @@ cdef class RegressionCriterion(Criterion): for k in range(self.n_outputs): dest[k] = self.sum_total[k] / self.weighted_n_node_samples + cdef inline void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: + """Clip the value in dest between lower_bound and upper_bound for monotonic constraints.""" + if dest[0] < lower_bound: + dest[0] = lower_bound + elif dest[0] > upper_bound: + dest[0] = upper_bound + + cdef double middle_value(self) noexcept nogil: + """Compute the middle value of a split for monotonicity constraints as the simple average + of the left and right children values. + + Monotonicity constraints are only supported for single-output trees we can safely assume + n_outputs == 1. + """ + return ( + (self.sum_left[0] / (2 * self.weighted_n_left)) + + (self.sum_right[0] / (2 * self.weighted_n_right)) + ) + + cdef bint check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + ) noexcept nogil: + """Check monotonicity constraint is satisfied at the current regression split""" + cdef: + double value_left = self.sum_left[0] / self.weighted_n_left + double value_right = self.sum_right[0] / self.weighted_n_right + + return self._check_monotonicity(monotonic_cst, lower_bound, upper_bound, value_left, value_right) cdef class MSE(RegressionCriterion): """Mean squared error impurity criterion. @@ -1365,6 +1465,31 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): dest[k] = self.node_medians[k] + cdef inline double middle_value(self) noexcept nogil: + """Compute the middle value of a split for monotonicity constraints as the simple average + of the left and right children values. + + Monotonicity constraints are only supported for single-output trees we can safely assume + n_outputs == 1. 
+ """ + return ( + ( self.left_child_ptr[0]).get_median() + + ( self.right_child_ptr[0]).get_median() + ) / 2 + + cdef inline bint check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + ) noexcept nogil: + """Check monotonicity constraint is satisfied at the current regression split""" + cdef: + double value_left = ( self.left_child_ptr[0]).get_median() + double value_right = ( self.right_child_ptr[0]).get_median() + + return self._check_monotonicity(monotonic_cst, lower_bound, upper_bound, value_left, value_right) + cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index a6515338c492d..3d8e4fd7510d7 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -10,6 +10,7 @@ # License: BSD 3 clause # See _splitter.pyx for details. +cimport numpy as cnp from libcpp.vector cimport vector @@ -31,6 +32,8 @@ cdef struct SplitRecord: double improvement # Impurity improvement given parent node. double impurity_left # Impurity of the left split. double impurity_right # Impurity of the right split. + double lower_bound # Lower bound on value of both children for monotonicity + double upper_bound # Upper bound on value of both children for monotonicity unsigned char missing_go_to_left # Controls if missing values go to the left node. SIZE_t n_missing # Number of missing values for the feature being split on @@ -61,6 +64,14 @@ cdef class BaseSplitter: cdef SIZE_t start # Start position for the current node cdef SIZE_t end # End position for the current node + # Monotonicity constraints for each feature. + # The encoding is as follows: + # -1: monotonic decrease + # 0: no constraint + # +1: monotonic increase + cdef const cnp.int8_t[:] monotonic_cst + cdef bint with_monotonic_cst + cdef const DOUBLE_t[:] sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -90,9 +101,17 @@ cdef class BaseSplitter: self, double impurity, # Impurity of the node SplitRecord* split, - SIZE_t* n_constant_features + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound, ) except -1 nogil cdef void node_value(self, double* dest) noexcept nogil + cdef void clip_node_value( + self, + double* dest, + double lower_bound, + double upper_bound + ) noexcept nogil cdef double node_impurity(self) noexcept nogil cdef int pointer_size(self) noexcept nogil @@ -117,6 +136,7 @@ cdef class Splitter(BaseSplitter): SIZE_t n_missing, bint missing_go_to_left, ) noexcept nogil + cdef bint check_postsplit_conditions( self ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index bca38d5f04374..a0fc6a7088e5c 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -21,6 +21,7 @@ from cython cimport final from libc.math cimport isnan from libc.stdlib cimport qsort from libc.string cimport memcpy +cimport numpy as cnp from ._criterion cimport Criterion @@ -88,8 +89,14 @@ cdef class BaseSplitter: """ pass - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound + ) except -1 nogil: """Find the best split on node samples[start:end]. This is a placeholder method. 
The majority of computation will be done @@ -103,6 +110,10 @@ cdef class BaseSplitter: """Copy the value of node samples[start:end] into dest.""" pass + cdef inline void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: + """Clip the value of node samples[start:end] into dest.""" + pass + cdef double node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" pass @@ -118,9 +129,16 @@ cdef class BaseSplitter: cdef class Splitter(BaseSplitter): """Abstract interface for supervised splitters.""" - def __cinit__(self, Criterion criterion, SIZE_t max_features, - SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state, *argv): + def __cinit__( + self, + Criterion criterion, + SIZE_t max_features, + SIZE_t min_samples_leaf, + double min_weight_leaf, + object random_state, + const cnp.int8_t[:] monotonic_cst, + *argv + ): """ Parameters ---------- @@ -142,6 +160,10 @@ cdef class Splitter(BaseSplitter): random_state : object The user inputted random state to be used for pseudo-randomness + + monotonic_cst : const cnp.int8_t[:] + Monotonicity constraints + """ self.criterion = criterion @@ -152,13 +174,16 @@ cdef class Splitter(BaseSplitter): self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.random_state = random_state + self.monotonic_cst = monotonic_cst + self.with_monotonic_cst = monotonic_cst is not None def __reduce__(self): return (type(self), (self.criterion, self.max_features, self.min_samples_leaf, self.min_weight_leaf, - self.random_state), self.__getstate__()) + self.random_state, + self.monotonic_cst), self.__getstate__()) cdef int init( self, @@ -275,6 +300,11 @@ cdef class Splitter(BaseSplitter): """Copy the value of node samples[start:end] into dest.""" self.criterion.node_value(dest) + + cdef inline void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: + """Clip the value in dest between lower_bound and upper_bound for monotonic constraints.""" + + self.criterion.clip_node_value(dest, lower_bound, upper_bound) cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil: """Copy the samples[start:end] into dest.""" @@ -367,6 +397,10 @@ cdef inline int node_split_best( double impurity, SplitRecord* split, SIZE_t* n_constant_features, + bint with_monotonic_cst, + const cnp.int8_t[:] monotonic_cst, + double lower_bound, + double upper_bound, ) except -1 nogil: """Find the best split on node samples[start:end] @@ -506,6 +540,18 @@ cdef inline int node_split_best( current_split.pos = p + # Reject if monotonicity constraints are not satisfied + if ( + with_monotonic_cst and + monotonic_cst[current_split.feature] != 0 and + not criterion.check_monotonicity( + monotonic_cst[current_split.feature], + lower_bound, + upper_bound, + ) + ): + continue + # Reject if min_samples_leaf is not guaranteed if missing_go_to_left: n_left = current_split.pos - splitter.start + n_missing @@ -729,7 +775,11 @@ cdef inline int node_split_random( Criterion criterion, double impurity, SplitRecord* split, - SIZE_t* n_constant_features + SIZE_t* n_constant_features, + bint with_monotonic_cst, + const cnp.int8_t[:] monotonic_cst, + double lower_bound, + double upper_bound, ) except -1 nogil: """Find the best random split on node samples[start:end] @@ -853,6 +903,18 @@ cdef inline int node_split_random( if splitter.check_postsplit_conditions() == 1: continue + # Reject if monotonicity constraints are not satisfied + if ( + with_monotonic_cst and 
+ monotonic_cst[current_split.feature] != 0 and + not criterion.check_monotonicity( + monotonic_cst[current_split.feature], + lower_bound, + upper_bound, + ) + ): + continue + current_proxy_improvement = criterion.proxy_impurity_improvement() if current_proxy_improvement > best_proxy_improvement: @@ -1538,8 +1600,14 @@ cdef class BestSplitter(Splitter): X, self.samples, self.feature_values, missing_values_in_feature_mask ) - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound + ) except -1 nogil: return node_split_best( self, self.partitioner, @@ -1547,6 +1615,10 @@ cdef class BestSplitter(Splitter): impurity, split, n_constant_features, + self.with_monotonic_cst, + self.monotonic_cst, + lower_bound, + upper_bound ) cdef class BestSparseSplitter(Splitter): @@ -1564,8 +1636,14 @@ cdef class BestSparseSplitter(Splitter): X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound + ) except -1 nogil: return node_split_best( self, self.partitioner, @@ -1573,6 +1651,10 @@ cdef class BestSparseSplitter(Splitter): impurity, split, n_constant_features, + self.with_monotonic_cst, + self.monotonic_cst, + lower_bound, + upper_bound ) cdef class RandomSplitter(Splitter): @@ -1590,8 +1672,14 @@ cdef class RandomSplitter(Splitter): X, self.samples, self.feature_values, missing_values_in_feature_mask ) - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound + ) except -1 nogil: return node_split_random( self, self.partitioner, @@ -1599,6 +1687,10 @@ cdef class RandomSplitter(Splitter): impurity, split, n_constant_features, + self.with_monotonic_cst, + self.monotonic_cst, + lower_bound, + upper_bound ) cdef class RandomSparseSplitter(Splitter): @@ -1615,9 +1707,14 @@ cdef class RandomSparseSplitter(Splitter): self.partitioner = SparsePartitioner( X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) - - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound + ) except -1 nogil: return node_split_random( self, self.partitioner, @@ -1625,4 +1722,8 @@ cdef class RandomSparseSplitter(Splitter): impurity, split, n_constant_features, + self.with_monotonic_cst, + self.monotonic_cst, + lower_bound, + upper_bound ) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index c44022f54d3a5..8dc4e94aa0732 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -149,6 +149,8 @@ cdef struct StackRecord: bint is_left double impurity SIZE_t n_constant_features + double lower_bound + double upper_bound cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" @@ -218,6 +220,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SplitRecord* 
split_ptr = malloc(splitter.pointer_size()) cdef double impurity = INFINITY + cdef double lower_bound + cdef double upper_bound + cdef double middle_value cdef SIZE_t n_constant_features cdef bint is_leaf cdef bint first = 1 @@ -236,7 +241,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": _TREE_UNDEFINED, "is_left": 0, "impurity": INFINITY, - "n_constant_features": 0}) + "n_constant_features": 0, + "lower_bound": -INFINITY, + "upper_bound": INFINITY, + }) while not builder_stack.empty(): stack_record = builder_stack.top() @@ -249,6 +257,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_left = stack_record.is_left impurity = stack_record.impurity n_constant_features = stack_record.n_constant_features + lower_bound = stack_record.lower_bound + upper_bound = stack_record.upper_bound n_node_samples = end - start splitter.node_reset(start, end, &weighted_n_node_samples) @@ -266,7 +276,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_leaf = is_leaf or impurity <= EPSILON if not is_leaf: - splitter.node_split(impurity, split_ptr, &n_constant_features) + splitter.node_split( + impurity, + split_ptr, + &n_constant_features, + lower_bound, + upper_bound + ) # assign local copy of SplitRecord to assign # pos, improvement, and impurity scores @@ -290,8 +306,42 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # Store value for all nodes, to facilitate tree/model # inspection and interpretation splitter.node_value(tree.value + node_id * tree.value_stride) + if splitter.with_monotonic_cst: + splitter.clip_node_value(tree.value + node_id * tree.value_stride, lower_bound, upper_bound) if not is_leaf: + if ( + not splitter.with_monotonic_cst or + splitter.monotonic_cst[split.feature] == 0 + ): + # Split on a feature with no monotonicity constraint + + # Current bounds must always be propagated to both children. + # If a monotonic constraint is active, bounds are used in + # node value clipping. + left_child_min = right_child_min = lower_bound + left_child_max = right_child_max = upper_bound + elif splitter.monotonic_cst[split.feature] == 1: + # Split on a feature with monotonic increase constraint + left_child_min = lower_bound + right_child_max = upper_bound + + # Lower bound for right child and upper bound for left child + # are set to the same value. + middle_value = splitter.criterion.middle_value() + right_child_min = middle_value + left_child_max = middle_value + else: # i.e. splitter.monotonic_cst[split.feature] == -1 + # Split on a feature with monotonic decrease constraint + right_child_min = lower_bound + left_child_max = upper_bound + + # Lower bound for left child and upper bound for right child + # are set to the same value. 
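The bound bookkeeping in the three branches above reduces to the following sketch (function and variable names are illustrative)::

    def child_bounds(lower_bound, upper_bound, middle_value, constraint):
        """Return (left_child_bounds, right_child_bounds) for one split."""
        if constraint == 0:
            # Unconstrained feature: propagate the parent's bounds unchanged.
            return (lower_bound, upper_bound), (lower_bound, upper_bound)
        if constraint == 1:
            # Monotonic increase: the left child may not exceed the middle
            # value, the right child may not fall below it.
            return (lower_bound, middle_value), (middle_value, upper_bound)
        # constraint == -1, monotonic decrease: mirror image of the above.
        return (middle_value, upper_bound), (lower_bound, middle_value)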
+ middle_value = splitter.criterion.middle_value() + left_child_min = middle_value + right_child_max = middle_value + # Push right child on stack builder_stack.push({ "start": split.pos, @@ -300,7 +350,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 0, "impurity": split.impurity_right, - "n_constant_features": n_constant_features}) + "n_constant_features": n_constant_features, + "lower_bound": right_child_min, + "upper_bound": right_child_max, + }) # Push left child on stack builder_stack.push({ @@ -310,7 +363,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 1, "impurity": split.impurity_left, - "n_constant_features": n_constant_features}) + "n_constant_features": n_constant_features, + "n_constant_features": n_constant_features, + "lower_bound": left_child_min, + "upper_bound": left_child_max, + }) elif self.store_leaf_values and is_leaf: # copy leaf values to leaf_values array splitter.node_samples(tree.value_samples[node_id]) @@ -346,6 +403,9 @@ cdef struct FrontierRecord: double impurity_left double impurity_right double improvement + double lower_bound + double upper_bound + double middle_value cdef inline bool _compare_records( const FrontierRecord& left, @@ -414,6 +474,10 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef FrontierRecord record cdef FrontierRecord split_node_left cdef FrontierRecord split_node_right + cdef double left_child_min + cdef double left_child_max + cdef double right_child_min + cdef double right_child_max cdef SIZE_t n_node_samples = splitter.n_samples cdef SIZE_t max_split_nodes = max_leaf_nodes - 1 @@ -428,9 +492,20 @@ cdef class BestFirstTreeBuilder(TreeBuilder): with nogil: # add root to frontier - rc = self._add_split_node(splitter, tree, 0, n_node_samples, - INFINITY, IS_FIRST, IS_LEFT, NULL, 0, - &split_node_left) + rc = self._add_split_node( + splitter=splitter, + tree=tree, + start=0, + end=n_node_samples, + impurity=INFINITY, + is_first=IS_FIRST, + is_left=IS_LEFT, + parent=NULL, + depth=0, + lower_bound=-INFINITY, + upper_bound=INFINITY, + res=&split_node_left, + ) if rc >= 0: _add_to_frontier(split_node_left, frontier) @@ -455,16 +530,54 @@ cdef class BestFirstTreeBuilder(TreeBuilder): else: # Node is expandable + if ( + not splitter.with_monotonic_cst or + splitter.monotonic_cst[node.feature] == 0 + ): + # Split on a feature with no monotonicity constraint + + # Current bounds must always be propagated to both children. + # If a monotonic constraint is active, bounds are used in + # node value clipping. + left_child_min = right_child_min = record.lower_bound + left_child_max = right_child_max = record.upper_bound + elif splitter.monotonic_cst[node.feature] == 1: + # Split on a feature with monotonic increase constraint + left_child_min = record.lower_bound + right_child_max = record.upper_bound + + # Lower bound for right child and upper bound for left child + # are set to the same value. + right_child_min = record.middle_value + left_child_max = record.middle_value + else: # i.e. splitter.monotonic_cst[split.feature] == -1 + # Split on a feature with monotonic decrease constraint + right_child_min = record.lower_bound + left_child_max = record.upper_bound + + # Lower bound for left child and upper bound for right child + # are set to the same value. 
+ left_child_min = record.middle_value + right_child_max = record.middle_value + # Decrement number of split nodes available max_split_nodes -= 1 # Compute left split node - rc = self._add_split_node(splitter, tree, - record.start, record.pos, - record.impurity_left, - IS_NOT_FIRST, IS_LEFT, node, - record.depth + 1, - &split_node_left) + rc = self._add_split_node( + splitter=splitter, + tree=tree, + start=record.start, + end=record.pos, + impurity=record.impurity_left, + is_first=IS_NOT_FIRST, + is_left=IS_LEFT, + parent=node, + depth=record.depth + 1, + lower_bound=left_child_min, + upper_bound=left_child_max, + res=&split_node_left, + ) if rc == -1: break @@ -472,12 +585,20 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node = &tree.nodes[record.node_id] # Compute right split node - rc = self._add_split_node(splitter, tree, record.pos, - record.end, - record.impurity_right, - IS_NOT_FIRST, IS_NOT_LEFT, node, - record.depth + 1, - &split_node_right) + rc = self._add_split_node( + splitter=splitter, + tree=tree, + start=record.pos, + end=record.end, + impurity=record.impurity_right, + is_first=IS_NOT_FIRST, + is_left=IS_NOT_LEFT, + parent=node, + depth=record.depth + 1, + lower_bound=right_child_min, + upper_bound=right_child_max, + res=&split_node_right, + ) if rc == -1: break @@ -497,11 +618,21 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if rc == -1: raise MemoryError() - cdef inline int _add_split_node(self, Splitter splitter, Tree tree, - SIZE_t start, SIZE_t end, double impurity, - bint is_first, bint is_left, Node* parent, - SIZE_t depth, - FrontierRecord* res) except -1 nogil: + cdef inline int _add_split_node( + self, + Splitter splitter, + Tree tree, + SIZE_t start, + SIZE_t end, + double impurity, + bint is_first, + bint is_left, + Node* parent, + SIZE_t depth, + double lower_bound, + double upper_bound, + FrontierRecord* res + ) nogil except -1: """Adds node w/ partition ``[start, end)`` to the frontier. """ cdef SplitRecord split cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) @@ -527,7 +658,13 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) if not is_leaf: - splitter.node_split(impurity, split_ptr, &n_constant_features) + splitter.node_split( + impurity, + split_ptr, + &n_constant_features, + lower_bound, + upper_bound + ) # assign local copy of SplitRecord to assign # pos, improvement, and impurity scores split = deref(split_ptr) @@ -548,12 +685,17 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # compute values also for split nodes (might become leafs later). 
splitter.node_value(tree.value + node_id * tree.value_stride) + if splitter.with_monotonic_cst: + splitter.clip_node_value(tree.value + node_id * tree.value_stride, lower_bound, upper_bound) res.node_id = node_id res.start = start res.end = end res.depth = depth res.impurity = impurity + res.lower_bound = lower_bound + res.upper_bound = upper_bound + res.middle_value = splitter.criterion.middle_value() if not is_leaf: # is split node diff --git a/sklearn/tree/tests/test_monotonic_tree.py b/sklearn/tree/tests/test_monotonic_tree.py new file mode 100644 index 0000000000000..462ac7305d7c2 --- /dev/null +++ b/sklearn/tree/tests/test_monotonic_tree.py @@ -0,0 +1,491 @@ +import numpy as np +import pytest +import scipy.sparse + +from sklearn.datasets import make_classification, make_regression +from sklearn.ensemble import ( + ExtraTreesClassifier, + ExtraTreesRegressor, + RandomForestClassifier, + RandomForestRegressor, +) +from sklearn.tree import ( + DecisionTreeClassifier, + DecisionTreeRegressor, + ExtraTreeClassifier, + ExtraTreeRegressor, +) + +TREE_CLASSIFIER_CLASSES = [DecisionTreeClassifier, ExtraTreeClassifier] +TREE_REGRESSOR_CLASSES = [DecisionTreeRegressor, ExtraTreeRegressor] +TREE_BASED_CLASSIFIER_CLASSES = TREE_CLASSIFIER_CLASSES + [ + RandomForestClassifier, + ExtraTreesClassifier, +] +TREE_BASED_REGRESSOR_CLASSES = TREE_REGRESSOR_CLASSES + [ + RandomForestRegressor, + ExtraTreesRegressor, +] + + +@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES) +@pytest.mark.parametrize("depth_first_builder", (True, False)) +@pytest.mark.parametrize("sparse_splitter", (True, False)) +def test_monotonic_constraints_classifications( + TreeClassifier, depth_first_builder, sparse_splitter, global_random_seed +): + n_samples = 1000 + n_samples_train = 900 + X, y = make_classification( + n_samples=n_samples, + n_classes=2, + n_features=5, + n_informative=5, + n_redundant=0, + random_state=global_random_seed, + ) + X_train, y_train = X[:n_samples_train], y[:n_samples_train] + X_test, _ = X[n_samples_train:], y[n_samples_train:] + + X_test_0incr, X_test_0decr = np.copy(X_test), np.copy(X_test) + X_test_1incr, X_test_1decr = np.copy(X_test), np.copy(X_test) + X_test_0incr[:, 0] += 10 + X_test_0decr[:, 0] -= 10 + X_test_1incr[:, 1] += 10 + X_test_1decr[:, 1] -= 10 + monotonic_cst = np.zeros(X.shape[1]) + monotonic_cst[0] = 1 + monotonic_cst[1] = -1 + + if depth_first_builder: + est = TreeClassifier(max_depth=None, monotonic_cst=monotonic_cst) + else: + est = TreeClassifier( + max_depth=None, + monotonic_cst=monotonic_cst, + max_leaf_nodes=n_samples_train, + ) + if hasattr(est, "random_state"): + est.set_params(**{"random_state": global_random_seed}) + if hasattr(est, "n_estimators"): + est.set_params(**{"n_estimators": 5}) + if sparse_splitter: + X_train = scipy.sparse.csc_matrix(X_train) + est.fit(X_train, y_train) + y = est.predict_proba(X_test)[:, 1] + + # Monotonic increase constraint, it applies to the positive class + assert np.all(est.predict_proba(X_test_0incr)[:, 1] >= y) + assert np.all(est.predict_proba(X_test_0decr)[:, 1] <= y) + + # Monotonic decrease constraint, it applies to the positive class + assert np.all(est.predict_proba(X_test_1incr)[:, 1] <= y) + assert np.all(est.predict_proba(X_test_1decr)[:, 1] >= y) + + +@pytest.mark.parametrize("TreeRegressor", TREE_BASED_REGRESSOR_CLASSES) +@pytest.mark.parametrize("depth_first_builder", (True, False)) +@pytest.mark.parametrize("sparse_splitter", (True, False)) +@pytest.mark.parametrize("criterion", ("absolute_error", 
"squared_error")) +def test_monotonic_constraints_regressions( + TreeRegressor, depth_first_builder, sparse_splitter, criterion, global_random_seed +): + n_samples = 1000 + n_samples_train = 900 + # Build a regression task using 5 informative features + X, y = make_regression( + n_samples=n_samples, + n_features=5, + n_informative=5, + random_state=global_random_seed, + ) + train = np.arange(n_samples_train) + test = np.arange(n_samples_train, n_samples) + X_train = X[train] + y_train = y[train] + X_test = np.copy(X[test]) + X_test_incr = np.copy(X_test) + X_test_decr = np.copy(X_test) + X_test_incr[:, 0] += 10 + X_test_decr[:, 1] += 10 + monotonic_cst = np.zeros(X.shape[1]) + monotonic_cst[0] = 1 + monotonic_cst[1] = -1 + + if depth_first_builder: + est = TreeRegressor( + max_depth=None, + monotonic_cst=monotonic_cst, + criterion=criterion, + ) + else: + est = TreeRegressor( + max_depth=8, + monotonic_cst=monotonic_cst, + criterion=criterion, + max_leaf_nodes=n_samples_train, + ) + if hasattr(est, "random_state"): + est.set_params(random_state=global_random_seed) + if hasattr(est, "n_estimators"): + est.set_params(**{"n_estimators": 5}) + if sparse_splitter: + X_train = scipy.sparse.csc_matrix(X_train) + est.fit(X_train, y_train) + y = est.predict(X_test) + # Monotonic increase constraint + y_incr = est.predict(X_test_incr) + # y_incr should always be greater than y + assert np.all(y_incr >= y) + + # Monotonic decrease constraint + y_decr = est.predict(X_test_decr) + # y_decr should always be lower than y + assert np.all(y_decr <= y) + + +@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES) +def test_multiclass_raises(TreeClassifier): + X, y = make_classification( + n_samples=100, n_features=5, n_classes=3, n_informative=3, random_state=0 + ) + y[0] = 0 + monotonic_cst = np.zeros(X.shape[1]) + monotonic_cst[0] = -1 + monotonic_cst[1] = 1 + est = TreeClassifier(max_depth=None, monotonic_cst=monotonic_cst, random_state=0) + + msg = "Monotonicity constraints are not supported with multiclass classification" + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + +@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES) +def test_multiple_output_raises(TreeClassifier): + X = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]] + y = [[1, 0, 1, 0, 1], [1, 0, 1, 0, 1]] + + est = TreeClassifier( + max_depth=None, monotonic_cst=np.array([-1, 1]), random_state=0 + ) + msg = "Monotonicity constraints are not supported with multiple output" + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + +@pytest.mark.parametrize( + "DecisionTreeEstimator", [DecisionTreeClassifier, DecisionTreeRegressor] +) +def test_missing_values_raises(DecisionTreeEstimator): + X, y = make_classification( + n_samples=100, n_features=5, n_classes=2, n_informative=3, random_state=0 + ) + X[0, 0] = np.nan + monotonic_cst = np.zeros(X.shape[1]) + monotonic_cst[0] = 1 + est = DecisionTreeEstimator( + max_depth=None, monotonic_cst=monotonic_cst, random_state=0 + ) + + msg = "Input X contains NaN" + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + +@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES) +def test_bad_monotonic_cst_raises(TreeClassifier): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + y = [1, 0, 1, 0, 1] + + msg = "monotonic_cst has shape 3 but the input data X has 2 features." 
+ est = TreeClassifier( + max_depth=None, monotonic_cst=np.array([-1, 1, 0]), random_state=0 + ) + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + msg = "monotonic_cst must be None or an array-like of -1, 0 or 1." + est = TreeClassifier( + max_depth=None, monotonic_cst=np.array([-2, 2]), random_state=0 + ) + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + est = TreeClassifier( + max_depth=None, monotonic_cst=np.array([-1, 0.8]), random_state=0 + ) + with pytest.raises(ValueError, match=msg + "(.*)0.8]"): + est.fit(X, y) + + +def assert_1d_reg_tree_children_monotonic_bounded(tree_, monotonic_sign): + values = tree_.value + for i in range(tree_.node_count): + if tree_.children_left[i] > i and tree_.children_right[i] > i: + # Check monotonicity on children + i_left = tree_.children_left[i] + i_right = tree_.children_right[i] + if monotonic_sign == 1: + assert values[i_left] <= values[i_right] + elif monotonic_sign == -1: + assert values[i_left] >= values[i_right] + val_middle = (values[i_left] + values[i_right]) / 2 + # Check bounds on grand-children, filtering out leaf nodes + if tree_.feature[i_left] >= 0: + i_left_right = tree_.children_right[i_left] + if monotonic_sign == 1: + assert values[i_left_right] <= val_middle + elif monotonic_sign == -1: + assert values[i_left_right] >= val_middle + if tree_.feature[i_right] >= 0: + i_right_left = tree_.children_left[i_right] + if monotonic_sign == 1: + assert val_middle <= values[i_right_left] + elif monotonic_sign == -1: + assert val_middle >= values[i_right_left] + + +def test_assert_1d_reg_tree_children_monotonic_bounded(): + X = np.linspace(-1, 1, 7).reshape(-1, 1) + y = np.sin(2 * np.pi * X.ravel()) + + reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y) + + with pytest.raises(AssertionError): + assert_1d_reg_tree_children_monotonic_bounded(reg.tree_, 1) + + with pytest.raises(AssertionError): + assert_1d_reg_tree_children_monotonic_bounded(reg.tree_, -1) + + +def assert_1d_reg_monotonic(clf, monotonic_sign, min_x, max_x, n_steps): + X_grid = np.linspace(min_x, max_x, n_steps).reshape(-1, 1) + y_pred_grid = clf.predict(X_grid) + if monotonic_sign == 1: + assert (np.diff(y_pred_grid) >= 0.0).all() + elif monotonic_sign == -1: + assert (np.diff(y_pred_grid) <= 0.0).all() + + +@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES) +def test_1d_opposite_monotonicity_cst_data(TreeRegressor): + # Check that positive monotonic data with negative monotonic constraint + # yield constant predictions, equal to the average of target values + X = np.linspace(-2, 2, 10).reshape(-1, 1) + y = X.ravel() + clf = TreeRegressor(monotonic_cst=[-1]) + clf.fit(X, y) + assert clf.tree_.node_count == 1 + assert clf.tree_.value[0] == 0.0 + + # Swap monotonicity + clf = TreeRegressor(monotonic_cst=[1]) + clf.fit(X, -y) + assert clf.tree_.node_count == 1 + assert clf.tree_.value[0] == 0.0 + + +@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES) +@pytest.mark.parametrize("monotonic_sign", (-1, 1)) +@pytest.mark.parametrize("depth_first_builder", (True, False)) +@pytest.mark.parametrize("criterion", ("absolute_error", "squared_error")) +def test_1d_tree_nodes_values( + TreeRegressor, monotonic_sign, depth_first_builder, criterion, global_random_seed +): + # Adaptation from test_nodes_values in test_monotonic_constraints.py + # in sklearn.ensemble._hist_gradient_boosting + # Build a single tree with only one feature, and make sure the node + # values respect the monotonicity constraints. 
+ + # Considering the following tree with a monotonic +1 constraint, we + # should have: + # + # root + # / \ + # a b + # / \ / \ + # c d e f + # + # a <= root <= b + # c <= d <= (a + b) / 2 <= e <= f + + rng = np.random.RandomState(global_random_seed) + n_samples = 1000 + n_features = 1 + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + + if depth_first_builder: + # No max_leaf_nodes, default depth first tree builder + clf = TreeRegressor( + monotonic_cst=[monotonic_sign], + criterion=criterion, + random_state=global_random_seed, + ) + else: + # max_leaf_nodes triggers best first tree builder + clf = TreeRegressor( + monotonic_cst=[monotonic_sign], + max_leaf_nodes=n_samples, + criterion=criterion, + random_state=global_random_seed, + ) + clf.fit(X, y) + + assert_1d_reg_tree_children_monotonic_bounded(clf.tree_, monotonic_sign) + assert_1d_reg_monotonic(clf, monotonic_sign, np.min(X), np.max(X), 100) + + +def assert_nd_reg_tree_children_monotonic_bounded(tree_, monotonic_cst): + upper_bound = np.full(tree_.node_count, np.inf) + lower_bound = np.full(tree_.node_count, -np.inf) + for i in range(tree_.node_count): + feature = tree_.feature[i] + node_value = tree_.value[i][0][0] # unpack value from nx1x1 array + # While building the tree, the computed middle value is slightly + # different from the average of the siblings values, because + # sum_right / weighted_n_right + # is slightly different from the value of the right sibling. + # This can cause a discrepancy up to numerical noise when clipping, + # which is resolved by comparing with some loss of precision. + assert np.float32(node_value) <= np.float32(upper_bound[i]) + assert np.float32(node_value) >= np.float32(lower_bound[i]) + + if feature < 0: + # Leaf: nothing to do + continue + + # Split node: check and update bounds for the children. + i_left = tree_.children_left[i] + i_right = tree_.children_right[i] + # unpack value from nx1x1 array + middle_value = (tree_.value[i_left][0][0] + tree_.value[i_right][0][0]) / 2 + + if monotonic_cst[feature] == 0: + # Feature without monotonicity constraint: propagate bounds + # down the tree to both children. + # Otherwise, with 2 features and a monotonic increase constraint + # (encoded by +1) on feature 0, the following tree can be accepted, + # although it does not respect the monotonic increase constraint: + # + # X[0] <= 0 + # value = 100 + # / \ + # X[0] <= -1 X[1] <= 0 + # value = 50 value = 150 + # / \ / \ + # leaf leaf leaf leaf + # value = 25 value = 75 value = 50 value = 250 + + lower_bound[i_left] = lower_bound[i] + upper_bound[i_left] = upper_bound[i] + lower_bound[i_right] = lower_bound[i] + upper_bound[i_right] = upper_bound[i] + + elif monotonic_cst[feature] == 1: + # Feature with constraint: check monotonicity + assert tree_.value[i_left] <= tree_.value[i_right] + + # Propagate bounds down the tree to both children. + lower_bound[i_left] = lower_bound[i] + upper_bound[i_left] = middle_value + lower_bound[i_right] = middle_value + upper_bound[i_right] = upper_bound[i] + + elif monotonic_cst[feature] == -1: + # Feature with constraint: check monotonicity + assert tree_.value[i_left] >= tree_.value[i_right] + + # Update and propagate bounds down the tree to both children. 
+ lower_bound[i_left] = middle_value + upper_bound[i_left] = upper_bound[i] + lower_bound[i_right] = lower_bound[i] + upper_bound[i_right] = middle_value + + else: # pragma: no cover + raise ValueError(f"monotonic_cst[{feature}]={monotonic_cst[feature]}") + + +def test_assert_nd_reg_tree_children_monotonic_bounded(): + # Check that assert_nd_reg_tree_children_monotonic_bounded can detect + # non-monotonic tree predictions. + X = np.linspace(0, 2 * np.pi, 30).reshape(-1, 1) + y = np.sin(X).ravel() + reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y) + + with pytest.raises(AssertionError): + assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [1]) + + with pytest.raises(AssertionError): + assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [-1]) + + assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [0]) + + # Check that assert_nd_reg_tree_children_monotonic_bounded raises + # when the data (and therefore the model) is naturally monotonic in the + # opposite direction. + X = np.linspace(-5, 5, 5).reshape(-1, 1) + y = X.ravel() ** 3 + reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y) + + with pytest.raises(AssertionError): + assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [-1]) + + # For completeness, check that the converse holds when swapping the sign. + reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, -y) + + with pytest.raises(AssertionError): + assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [1]) + + +@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES) +@pytest.mark.parametrize("monotonic_sign", (-1, 1)) +@pytest.mark.parametrize("depth_first_builder", (True, False)) +@pytest.mark.parametrize("criterion", ("absolute_error", "squared_error")) +def test_nd_tree_nodes_values( + TreeRegressor, monotonic_sign, depth_first_builder, criterion, global_random_seed +): + # Build tree with several features, and make sure the nodes + # values respect the monotonicity constraints. + + # Considering the following tree with a monotonic increase constraint on X[0], + # we should have: + # + # root + # X[0]<=t + # / \ + # a b + # X[0]<=u X[1]<=v + # / \ / \ + # c d e f + # + # i) a <= root <= b + # ii) c <= a <= d <= (a+b)/2 + # iii) (a+b)/2 <= min(e,f) + # For iii) we check that each node value is within the proper lower and + # upper bounds. 
+ + rng = np.random.RandomState(global_random_seed) + n_samples = 1000 + n_features = 2 + monotonic_cst = [monotonic_sign, 0] + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + + if depth_first_builder: + # No max_leaf_nodes, default depth first tree builder + clf = TreeRegressor( + monotonic_cst=monotonic_cst, + criterion=criterion, + random_state=global_random_seed, + ) + else: + # max_leaf_nodes triggers best first tree builder + clf = TreeRegressor( + monotonic_cst=monotonic_cst, + max_leaf_nodes=n_samples, + criterion=criterion, + random_state=global_random_seed, + ) + clf.fit(X, y) + assert_nd_reg_tree_children_monotonic_bounded(clf.tree_, monotonic_cst) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 0ce7a548c7bdb..ccca6d60ed48b 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2371,7 +2371,7 @@ def test_splitter_serializable(Splitter): n_outputs, n_classes = 2, np.array([3, 2], dtype=np.intp) criterion = CRITERIA_CLF["gini"](n_outputs, n_classes) - splitter = Splitter(criterion, max_features, 5, 0.5, rng) + splitter = Splitter(criterion, max_features, 5, 0.5, rng, monotonic_cst=None) splitter_serialize = pickle.dumps(splitter) splitter_back = pickle.loads(splitter_serialize) From 34e540a98d3103388003ff8aa3bb2066404a5f31 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 5 Jul 2023 11:00:37 -0400 Subject: [PATCH 30/39] Fix splitter Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index af59a594793b8..429f49947c47d 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -110,7 +110,7 @@ cdef class BaseSplitter: """Copy the value of node samples[start:end] into dest.""" pass - cdef inline void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: + cdef void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: """Clip the value of node samples[start:end] into dest.""" pass @@ -310,11 +310,6 @@ cdef class Splitter(BaseSplitter): """Copy the samples[start:end] into dest.""" self.criterion.node_samples(dest) - cdef inline void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: - """Clip the value in dest between lower_bound and upper_bound for monotonic constraints.""" - - self.criterion.clip_node_value(dest, lower_bound, upper_bound) - cdef double node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" From e9d702b8a38824f67752b9a5a6aefb964b511551 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 19 Jul 2023 16:19:49 -0400 Subject: [PATCH 31/39] Fix linter Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 1d3b247bd4586..7e494b0e9bccc 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2290,7 +2290,7 @@ class ExtraTreesClassifier(ForestClassifier): `max_samples` should be in the interval `(0.0, 1.0]`. .. versionadded:: 0.22 - + max_bins : int, default=255 The maximum number of bins to use for non-missing values. 
From ce6a727f9ddd814c90997eae2e7aae441566a18f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jul 2023 10:56:41 -0400 Subject: [PATCH 32/39] Fix linting Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 7e494b0e9bccc..7f2c5d44c2c1a 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -777,16 +777,21 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): # (n_total_leaf_samples, n_outputs) leaf_node_samples = np.vstack( - ( + [ est.leaf_nodes_samples_[leaf_nodes[jdx]] for jdx, est in enumerate(self.estimators_) - ) + ] ) # get quantiles across all leaf node samples - y_hat[idx, ...] = np.quantile( - leaf_node_samples, quantiles, axis=0, interpolation=method - ) + try: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, method=method + ) + except TypeError: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) if is_classifier(self): if self.n_outputs_ == 1: From 00a3595b973f01e25d9ee50eedd0504b89096c8e Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jul 2023 11:21:20 -0400 Subject: [PATCH 33/39] Fix docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 7f2c5d44c2c1a..4cad13b6c7658 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1533,9 +1533,13 @@ class RandomForestClassifier(ForestClassifier): max_bins : int, default=255 The maximum number of bins to use for non-missing values. + **Experimental feature** + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + **Experimental feature** + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1921,9 +1925,13 @@ class RandomForestRegressor(ForestRegressor): The maximum number of bins to use for non-missing values. Used for speeding up training time. + **Experimental feature** + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + **Experimental feature** + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2299,9 +2307,13 @@ class ExtraTreesClassifier(ForestClassifier): max_bins : int, default=255 The maximum number of bins to use for non-missing values. + **Experimental feature** + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + **Experimental feature** + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2690,9 +2702,13 @@ class ExtraTreesRegressor(ForestRegressor): max_bins : int, default=255 The maximum number of bins to use for non-missing values. + **Experimental feature** + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + **Experimental feature** + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. 
- 1: monotonically increasing From 329cbc89ff19991429f042b72909bfd412ed4c63 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jul 2023 13:25:00 -0400 Subject: [PATCH 34/39] Fix lint Signed-off-by: Adam Li --- min_dependency_substitutions.rst | 28 ++++++++++++++++++++++++++++ min_dependency_table.rst | 32 ++++++++++++++++++++++++++++++++ sklearn/ensemble/_forest.py | 2 +- 3 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 min_dependency_substitutions.rst create mode 100644 min_dependency_table.rst diff --git a/min_dependency_substitutions.rst b/min_dependency_substitutions.rst new file mode 100644 index 0000000000000..575b003b15a32 --- /dev/null +++ b/min_dependency_substitutions.rst @@ -0,0 +1,28 @@ +.. |NumpyMinVersion| replace:: 1.17.3 +.. |ScipyMinVersion| replace:: 1.5.0 +.. |JoblibMinVersion| replace:: 1.1.1 +.. |ThreadpoolctlMinVersion| replace:: 2.0.0 +.. |CythonMinVersion| replace:: 0.29.33 +.. |MatplotlibMinVersion| replace:: 3.1.3 +.. |Scikit-imageMinVersion| replace:: 0.16.2 +.. |PandasMinVersion| replace:: 1.0.5 +.. |SeabornMinVersion| replace:: 0.9.0 +.. |Memory_profilerMinVersion| replace:: 0.57.0 +.. |PytestMinVersion| replace:: 7.1.2 +.. |Pytest-covMinVersion| replace:: 2.9.0 +.. |RuffMinVersion| replace:: 0.0.272 +.. |BlackMinVersion| replace:: 23.3.0 +.. |MypyMinVersion| replace:: 1.3 +.. |PyamgMinVersion| replace:: 4.0.0 +.. |PolarsMinVersion| replace:: 0.18.2 +.. |PyarrowMinVersion| replace:: 12.0.0 +.. |SphinxMinVersion| replace:: 6.0.0 +.. |Sphinx-copybuttonMinVersion| replace:: 0.5.2 +.. |Sphinx-galleryMinVersion| replace:: 0.10.1 +.. |NumpydocMinVersion| replace:: 1.2.0 +.. |PillowMinVersion| replace:: 7.1.2 +.. |PoochMinVersion| replace:: 1.6.0 +.. |Sphinx-promptMinVersion| replace:: 1.3.0 +.. |Sphinxext-opengraphMinVersion| replace:: 0.4.2 +.. |PlotlyMinVersion| replace:: 5.14.0 +.. |Conda-lockMinVersion| replace:: 2.1.1 diff --git a/min_dependency_table.rst b/min_dependency_table.rst new file mode 100644 index 0000000000000..3a223a0fef797 --- /dev/null +++ b/min_dependency_table.rst @@ -0,0 +1,32 @@ +======================= =================== ==================================== +Dependency Minimum Version Purpose +======================= =================== ==================================== +numpy 1.17.3 build, install +scipy 1.5.0 build, install +joblib 1.1.1 install +threadpoolctl 2.0.0 install +cython 0.29.33 build +matplotlib 3.1.3 benchmark, docs, examples, tests +scikit-image 0.16.2 docs, examples, tests +pandas 1.0.5 benchmark, docs, examples, tests +seaborn 0.9.0 docs, examples +memory_profiler 0.57.0 benchmark, docs +pytest 7.1.2 tests +pytest-cov 2.9.0 tests +ruff 0.0.272 tests +black 23.3.0 tests +mypy 1.3 tests +pyamg 4.0.0 tests +polars 0.18.2 tests +pyarrow 12.0.0 tests +sphinx 6.0.0 docs +sphinx-copybutton 0.5.2 docs +sphinx-gallery 0.10.1 docs +numpydoc 1.2.0 docs, tests +Pillow 7.1.2 docs +pooch 1.6.0 docs, examples, tests +sphinx-prompt 1.3.0 docs +sphinxext-opengraph 0.4.2 docs +plotly 5.14.0 docs, examples +conda-lock 2.1.1 maintenance +======================= =================== ==================================== diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 4cad13b6c7658..66af09d79f203 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2708,7 +2708,7 @@ class ExtraTreesRegressor(ForestRegressor): Whether to store the leaf values in the ``get_leaf_node_samples`` function. 
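The `try`/`except TypeError` around `np.quantile` in the `predict_quantiles` changes of this series is a NumPy-version shim: the `interpolation` keyword was renamed to `method` in NumPy 1.22. A standalone sketch of the same idea (helper name is illustrative)::

    import numpy as np

    def quantile_compat(a, q, axis=0, method="nearest"):
        # Prefer the modern keyword, fall back to the deprecated one on old NumPy.
        try:
            return np.quantile(a, q, axis=axis, method=method)
        except TypeError:
            return np.quantile(a, q, axis=axis, interpolation=method)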
**Experimental feature** - + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing From 8b5d0f9e5d5b13016c5a3444f632469e8c799f7a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jul 2023 14:13:49 -0400 Subject: [PATCH 35/39] Fix unit test Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 1b718f3a04052..fa68f18921636 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -724,9 +724,14 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True for idx, leaf_id in enumerate(X_leaves): # predict by taking the quantile across the samples in the leaf for # each output - proba[idx, ...] = np.quantile( - leaf_samples[leaf_id], quantiles, axis=0, interpolation=method - ) + try: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, method=method + ) + except TypeError: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, interpolation=method + ) # Classification if is_classifier(self): From 38bade77502b16a203cf6448d9badabcef5d69fa Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jul 2023 16:37:52 -0400 Subject: [PATCH 36/39] Fix lint Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 38 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 66af09d79f203..4e33fc07d85cc 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1533,12 +1533,12 @@ class RandomForestClassifier(ForestClassifier): max_bins : int, default=255 The maximum number of bins to use for non-missing values. - **Experimental feature** + **This is an experimental feature**. store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. - **Experimental feature** + **This is an experimental feature**. monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. @@ -1925,12 +1925,12 @@ class RandomForestRegressor(ForestRegressor): The maximum number of bins to use for non-missing values. Used for speeding up training time. - **Experimental feature** + **This is an experimental feature**. store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. - **Experimental feature** + **This is an experimental feature**. monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. @@ -2307,31 +2307,12 @@ class ExtraTreesClassifier(ForestClassifier): max_bins : int, default=255 The maximum number of bins to use for non-missing values. - **Experimental feature** + **This is an experimental feature**. store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. - **Experimental feature** - - monotonic_cst : array-like of int of shape (n_features), default=None - Indicates the monotonicity constraint to enforce on each feature. - - 1: monotonically increasing - - 0: no constraint - - -1: monotonically decreasing - - If monotonic_cst is None, no constraints are applied. - - Monotonicity constraints are not supported for: - - multiclass classifications (i.e. 
when `n_classes > 2`), - - multioutput classifications (i.e. when `n_outputs_ > 1`), - - classifications trained on data with missing values. - - The constraints hold over the probability of the positive class. - - Read more in the :ref:`User Guide `. - - .. versionadded:: 1.4 + **This is an experimental feature**. monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. @@ -2702,12 +2683,12 @@ class ExtraTreesRegressor(ForestRegressor): max_bins : int, default=255 The maximum number of bins to use for non-missing values. - **Experimental feature** + **This is an experimental feature**. store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. - **Experimental feature** + **This is an experimental feature**. monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. @@ -2990,6 +2971,9 @@ class RandomTreesEmbedding(TransformerMixin, BaseForest): new forest. See :term:`Glossary ` and :ref:`gradient_boosting_warm_start` for details. + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance From feffdeb35834a9258348dec525820b615dd03fd0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jul 2023 19:45:39 -0400 Subject: [PATCH 37/39] Adding fix Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index fa68f18921636..091c7e9b4c002 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -416,9 +416,6 @@ def _fit( random_state, ) - if self.store_leaf_values: - self.leaf_nodes_samples_ = self.tree_.leaf_nodes_samples - return self def _build_tree( @@ -1118,9 +1115,6 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. - leaf_nodes_samples_ : dict - A dictionary of leaf node index and the y_train samples in that leaf. - See Also -------- DecisionTreeRegressor : A decision tree regressor. From 2bb5f1c13e08e811bf6868e6cd7d44d7337e8cd4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 21 Jul 2023 11:46:50 -0400 Subject: [PATCH 38/39] Fixed Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- sklearn/tree/_classes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 4e33fc07d85cc..f4a574c62c5e9 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -778,7 +778,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): # (n_total_leaf_samples, n_outputs) leaf_node_samples = np.vstack( [ - est.leaf_nodes_samples_[leaf_nodes[jdx]] + est.tree_.leaf_nodes_samples[leaf_nodes[jdx]] for jdx, est in enumerate(self.estimators_) ] ) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 091c7e9b4c002..7482fd6022e50 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -651,7 +651,7 @@ def get_leaf_node_samples(self, X, check_input=True): Returns ------- - leaf_nodes_samples : a list of array-like + leaf_nodes_samples : a list of array-like of length (n_samples,) Each sample is represented by the indices of the training samples that reached the leaf node. 
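For orientation on the API touched by PATCHES 36-38 above, a rough usage sketch assuming an estimator built from this fork branch. ``store_leaf_values``, ``get_leaf_node_samples`` and ``predict_quantiles`` are the fork-specific names appearing in these diffs, not upstream scikit-learn API, and their availability on the forest classes is an assumption about this branch:

```python
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=4, random_state=0)

# store_leaf_values=True keeps the training y values reaching each leaf.
est = RandomForestRegressor(n_estimators=10, store_leaf_values=True, random_state=0)
est.fit(X, y)

# One array per query sample: the training samples that reached its leaves.
leaf_samples = est.get_leaf_node_samples(X[:3])

# Per-sample quantiles over the stored leaf values (the PATCH 32/38 code path).
y_median = est.predict_quantiles(X[:3], quantiles=0.5, method="nearest")
```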
The ``n_leaf_node_samples`` may vary between samples, since the number of samples that fall in a leaf node is From 6ec023b366065100b8e61261f139cb57f587bbec Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 11 Aug 2023 10:33:48 -0400 Subject: [PATCH 39/39] [MERGE] Merge changes from sklearn main (#52) Merging latest changes from sklearn main #### What does this implement/fix? Explain your changes. #### Any other comments? --------- Signed-off-by: Adam Li --- .cirrus.star | 12 +- .github/workflows/wheels.yml | 12 + .gitignore | 3 + azure-pipelines.yml | 1 - build_tools/azure/posix-docker.yml | 1 - build_tools/azure/posix.yml | 1 - build_tools/azure/test_script.sh | 6 +- build_tools/cirrus/arm_tests.yml | 8 +- build_tools/cirrus/arm_wheel.yml | 12 +- build_tools/cirrus/build_test_arm.sh | 2 +- .../update_environments_and_lock_files.py | 6 +- doc/developers/contributing.rst | 1 + doc/glossary.rst | 23 + doc/install.rst | 12 +- doc/modules/array_api.rst | 19 +- doc/modules/compose.rst | 6 +- doc/modules/cross_validation.rst | 1 + doc/modules/ensemble.rst | 6 +- doc/modules/linear_model.rst | 12 +- doc/modules/neighbors.rst | 8 +- doc/modules/preprocessing.rst | 16 +- doc/modules/svm.rst | 34 +- doc/modules/tree.rst | 4 +- doc/related_projects.rst | 95 ++-- .../scikit-learn-modern/static/css/theme.css | 8 +- .../machine_learning_map/pyparsing.py | 2 +- doc/whats_new/v0.22.rst | 2 +- doc/whats_new/v1.3.rst | 36 ++ doc/whats_new/v1.4.rst | 96 ++++ .../plot_classifier_comparison.py | 16 +- ...ot_forest_hist_grad_boosting_comparison.py | 4 +- .../plot_select_from_model_diabetes.py | 54 +- .../miscellaneous/plot_metadata_routing.py | 8 +- examples/neighbors/plot_classification.py | 107 ++-- .../plot_target_encoder_cross_val.py | 137 +++-- .../plot_release_highlights_0_23_0.py | 2 +- .../plot_release_highlights_0_24_0.py | 2 +- .../plot_release_highlights_1_0_0.py | 2 +- .../plot_release_highlights_1_1_0.py | 2 +- .../plot_release_highlights_1_2_0.py | 2 +- .../plot_release_highlights_1_3_0.py | 2 +- pyproject.toml | 2 +- setup.cfg | 4 +- setup.py | 18 +- sklearn/_loss/loss.py | 12 +- sklearn/calibration.py | 14 +- sklearn/cluster/_dbscan.py | 12 +- sklearn/cluster/_hdbscan/hdbscan.py | 60 ++- sklearn/cluster/tests/test_hdbscan.py | 69 ++- sklearn/cross_decomposition/_pls.py | 7 +- sklearn/cross_decomposition/tests/test_pls.py | 23 + sklearn/ensemble/_forest.py | 57 ++- sklearn/ensemble/_gb.py | 48 +- .../gradient_boosting.py | 16 +- sklearn/ensemble/_stacking.py | 2 +- sklearn/ensemble/tests/test_forest.py | 91 ++++ sklearn/feature_selection/_rfe.py | 7 +- sklearn/impute/_knn.py | 7 +- sklearn/impute/tests/test_common.py | 36 ++ sklearn/linear_model/_logistic.py | 22 +- sklearn/metrics/_dist_metrics.pxd.tp | 40 +- sklearn/metrics/_dist_metrics.pyx.tp | 481 +++++++++++------- .../_argkmin.pyx.tp | 2 +- .../_argkmin_classmode.pyx.tp | 34 +- .../_classmode.pxd | 5 + .../_datasets_pair.pxd.tp | 14 +- .../_datasets_pair.pyx.tp | 35 +- .../_dispatcher.py | 34 +- sklearn/metrics/_scorer.py | 9 +- sklearn/metrics/tests/test_classification.py | 8 +- sklearn/metrics/tests/test_dist_metrics.py | 32 +- .../test_pairwise_distances_reduction.py | 48 +- sklearn/metrics/tests/test_score_objects.py | 20 +- sklearn/mixture/_gaussian_mixture.py | 95 +++- .../mixture/tests/test_gaussian_mixture.py | 85 ++++ sklearn/model_selection/_search.py | 2 + .../_search_successive_halving.py | 8 +- sklearn/model_selection/_split.py | 2 +- sklearn/model_selection/_validation.py | 303 +++++++++-- 
sklearn/model_selection/tests/test_search.py | 39 +- sklearn/model_selection/tests/test_split.py | 80 ++- .../tests/test_successive_halving.py | 73 ++- .../model_selection/tests/test_validation.py | 230 ++++++++- sklearn/multioutput.py | 16 +- sklearn/neighbors/_ball_tree.pyx | 195 ------- sklearn/neighbors/_ball_tree.pyx.tp | 284 +++++++++++ sklearn/neighbors/_base.py | 20 +- .../{_binary_tree.pxi => _binary_tree.pxi.tp} | 382 ++++++++------ sklearn/neighbors/_classification.py | 4 +- .../{_kd_tree.pyx => _kd_tree.pyx.tp} | 173 +++++-- sklearn/neighbors/_partition_nodes.pxd | 3 +- sklearn/neighbors/_partition_nodes.pyx | 4 +- sklearn/neighbors/_regression.py | 7 +- sklearn/neighbors/tests/test_ball_tree.py | 131 ++++- sklearn/neighbors/tests/test_kd_tree.py | 80 ++- sklearn/neighbors/tests/test_neighbors.py | 114 ++++- .../neighbors/tests/test_neighbors_tree.py | 4 +- sklearn/pipeline.py | 22 +- sklearn/preprocessing/_encoders.py | 25 +- sklearn/preprocessing/_label.py | 18 +- sklearn/preprocessing/_target_encoder.py | 34 +- sklearn/preprocessing/tests/test_encoders.py | 20 + .../tests/test_function_transformer.py | 5 +- sklearn/preprocessing/tests/test_label.py | 14 + sklearn/svm/_base.py | 26 +- sklearn/svm/_classes.py | 74 ++- sklearn/tests/metadata_routing_common.py | 407 +++++++++++++++ sklearn/tests/test_metadata_routing.py | 383 ++++---------- .../test_metaestimators_metadata_routing.py | 200 +------- sklearn/tests/test_pipeline.py | 29 +- sklearn/tree/_classes.py | 8 +- sklearn/tree/_export.py | 4 +- sklearn/utils/__init__.py | 2 +- sklearn/utils/_array_api.py | 3 + sklearn/utils/_encode.py | 2 +- sklearn/utils/_estimator_html_repr.py | 66 ++- sklearn/utils/_metadata_requests.py | 70 ++- sklearn/utils/_set_output.py | 4 +- sklearn/utils/_testing.py | 28 +- sklearn/utils/estimator_checks.py | 13 +- sklearn/utils/fixes.py | 8 + sklearn/utils/multiclass.py | 9 +- .../utils/tests/test_estimator_html_repr.py | 3 + sklearn/utils/tests/test_pprint.py | 2 +- sklearn/utils/tests/test_set_output.py | 29 ++ 125 files changed, 4085 insertions(+), 1809 deletions(-) create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd delete mode 100644 sklearn/neighbors/_ball_tree.pyx create mode 100644 sklearn/neighbors/_ball_tree.pyx.tp rename sklearn/neighbors/{_binary_tree.pxi => _binary_tree.pxi.tp} (90%) rename sklearn/neighbors/{_kd_tree.pyx => _kd_tree.pyx.tp} (65%) create mode 100644 sklearn/tests/metadata_routing_common.py diff --git a/.cirrus.star b/.cirrus.star index 2dd1e50144987..7a432556c1299 100644 --- a/.cirrus.star +++ b/.cirrus.star @@ -14,7 +14,7 @@ def main(ctx): # Nightly jobs always run if env.get("CIRRUS_CRON", "") == "nightly": - return fs.read(arm_wheel_yaml) + return fs.read(arm_wheel_yaml) + fs.read(arm_tests_yaml) # Get commit message for event. We can not use `git` here because there is # no command line access in starlark. 
Thus we need to query the GitHub API @@ -26,10 +26,12 @@ def main(ctx): response = http.get(url).json() commit_msg = response["message"] - if "[skip ci]" in commit_msg: - return [] + jobs_to_run = "" if "[cd build]" in commit_msg or "[cd build cirrus]" in commit_msg: - return fs.read(arm_wheel_yaml) + fs.read(arm_tests_yaml) + jobs_to_run += fs.read(arm_wheel_yaml) + + if "[cirrus arm]" in commit_msg: + jobs_to_run += fs.read(arm_tests_yaml) - return fs.read(arm_tests_yaml) + return jobs_to_run diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 4ab75fd361586..4300db6c5e208 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -103,6 +103,18 @@ jobs: python: 311 platform_id: macosx_x86_64 + # MacOS arm64 + # The latest Python version is built and tested on CirrusCI + - os: macos-latest + python: 38 + platform_id: macosx_arm64 + - os: macos-latest + python: 39 + platform_id: macosx_arm64 + - os: macos-latest + python: 310 + platform_id: macosx_arm64 + steps: - name: Checkout scikit-learn uses: actions/checkout@v3 diff --git a/.gitignore b/.gitignore index 5296f46280e4d..cfc13d4997b4b 100644 --- a/.gitignore +++ b/.gitignore @@ -100,6 +100,9 @@ sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx +sklearn/neighbors/_ball_tree.pyx +sklearn/neighbors/_binary_tree.pxi +sklearn/neighbors/_kd_tree.pyx # Default JupyterLite content jupyterlite_contents diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6c3511319e4eb..464096fb69c29 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -171,7 +171,6 @@ jobs: DISTRIB: 'conda' LOCK_FILE: './build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock' COVERAGE: 'true' - SHOW_SHORT_SUMMARY: 'true' SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '42' # default global random seed # Check compilation with Ubuntu 22.04 LTS (Jammy Jellyfish) and scipy from conda-forge diff --git a/build_tools/azure/posix-docker.yml b/build_tools/azure/posix-docker.yml index af776c4c62f14..b00ca66c378ca 100644 --- a/build_tools/azure/posix-docker.yml +++ b/build_tools/azure/posix-docker.yml @@ -22,7 +22,6 @@ jobs: # Set in azure-pipelines.yml DISTRIB: '' DOCKER_CONTAINER: '' - SHOW_SHORT_SUMMARY: 'false' CREATE_ISSUE_ON_TRACKER: 'true' CCACHE_DIR: $(Pipeline.Workspace)/ccache CCACHE_COMPRESS: '1' diff --git a/build_tools/azure/posix.yml b/build_tools/azure/posix.yml index 2ee03daafd288..35e5165d22c83 100644 --- a/build_tools/azure/posix.yml +++ b/build_tools/azure/posix.yml @@ -22,7 +22,6 @@ jobs: PYTEST_XDIST_VERSION: 'latest' COVERAGE: 'true' CREATE_ISSUE_ON_TRACKER: 'true' - SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 98ac2e797b73c..5117473ea6366 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -49,7 +49,7 @@ if [[ "$COVERAGE" == "true" ]]; then fi if [[ -n "$CHECK_WARNINGS" ]]; then - TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Werror::numpy.VisibleDeprecationWarning" + TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Werror::sklearn.utils.fixes.VisibleDeprecationWarning" # numpy's 1.19.0's tostring() deprecation is ignored until scipy and joblib # removes its usage @@ -75,10 
+75,6 @@ if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then TEST_CMD="$TEST_CMD -n$XDIST_WORKERS" fi -if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then - TEST_CMD="$TEST_CMD -ra" -fi - if [[ -n "$SELECTED_TESTS" ]]; then TEST_CMD="$TEST_CMD -k $SELECTED_TESTS" diff --git a/build_tools/cirrus/arm_tests.yml b/build_tools/cirrus/arm_tests.yml index a6e5919ecc32f..d1ac551a749e3 100644 --- a/build_tools/cirrus/arm_tests.yml +++ b/build_tools/cirrus/arm_tests.yml @@ -17,4 +17,10 @@ linux_aarch64_test_task: folder: /root/.conda/pkgs fingerprint_script: cat build_tools/cirrus/py39_conda_forge_linux-aarch64_conda.lock - test_script: bash build_tools/cirrus/build_test_arm.sh + test_script: | + bash build_tools/cirrus/build_test_arm.sh + # On success, this script is run updating the issue. + bash build_tools/cirrus/update_tracking_issue.sh true + + on_failure: + update_tracker_script: bash build_tools/cirrus/update_tracking_issue.sh false diff --git a/build_tools/cirrus/arm_wheel.yml b/build_tools/cirrus/arm_wheel.yml index a7023867e1109..5616108315fba 100644 --- a/build_tools/cirrus/arm_wheel.yml +++ b/build_tools/cirrus/arm_wheel.yml @@ -16,12 +16,8 @@ macos_arm64_wheel_task: # See `maint_tools/update_tracking_issue.py` for details on the permissions the token requires. BOT_GITHUB_TOKEN: ENCRYPTED[9b50205e2693f9e4ce9a3f0fcb897a259289062fda2f5a3b8aaa6c56d839e0854a15872f894a70fca337dd4787274e0f] matrix: - - env: - CIBW_BUILD: cp38-macosx_arm64 - - env: - CIBW_BUILD: cp39-macosx_arm64 - - env: - CIBW_BUILD: cp310-macosx_arm64 + # Only the latest Python version is built and tested on CirrusCI, the other + # macos arm64 builds are on GitHub Actions - env: CIBW_BUILD: cp311-macosx_arm64 @@ -60,12 +56,16 @@ linux_arm64_wheel_task: # See `maint_tools/update_tracking_issue.py` for details on the permissions the token requires. BOT_GITHUB_TOKEN: ENCRYPTED[9b50205e2693f9e4ce9a3f0fcb897a259289062fda2f5a3b8aaa6c56d839e0854a15872f894a70fca337dd4787274e0f] matrix: + # Only the latest Python version is tested - env: CIBW_BUILD: cp38-manylinux_aarch64 + CIBW_TEST_SKIP: "*_aarch64" - env: CIBW_BUILD: cp39-manylinux_aarch64 + CIBW_TEST_SKIP: "*_aarch64" - env: CIBW_BUILD: cp310-manylinux_aarch64 + CIBW_TEST_SKIP: "*_aarch64" - env: CIBW_BUILD: cp311-manylinux_aarch64 diff --git a/build_tools/cirrus/build_test_arm.sh b/build_tools/cirrus/build_test_arm.sh index 4eeef6ec2dc0c..dfe048da47a7f 100755 --- a/build_tools/cirrus/build_test_arm.sh +++ b/build_tools/cirrus/build_test_arm.sh @@ -25,7 +25,7 @@ setup_ccache() { MAMBAFORGE_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-aarch64.sh" # Install Mambaforge -wget $MAMBAFORGE_URL -O mambaforge.sh +curl -L $MAMBAFORGE_URL -o mambaforge.sh MAMBAFORGE_PATH=$HOME/mambaforge bash ./mambaforge.sh -b -p $MAMBAFORGE_PATH export PATH=$MAMBAFORGE_PATH/bin:$PATH diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py index 4854cc7936aca..35c382bd7f5ab 100644 --- a/build_tools/update_environments_and_lock_files.py +++ b/build_tools/update_environments_and_lock_files.py @@ -556,15 +556,15 @@ def check_conda_version(): # Avoid issues with glibc (https://github.com/conda/conda-lock/issues/292) # or osx (https://github.com/conda/conda-lock/issues/408) virtual package. # The glibc one has been fixed in conda 23.1.0 and the osx has been fixed - # in main and will be fixed when conda >= 23.6 is released. + # in conda 23.7.0. 
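The ``check_conda_version`` hunk above only widens a rejected version range; a self-contained sketch of the same guard, assuming ``packaging`` is used for the comparison as in typical build tooling:

```python
from packaging.version import Version


def check_conda_version(conda_version_str):
    # conda-lock is affected by virtual-package issues for conda versions
    # strictly between 22.9.0 and 23.7, so that range is rejected outright.
    conda_version = Version(conda_version_str)
    if Version("22.9.0") < conda_version < Version("23.7"):
        raise RuntimeError(
            f"conda version should be <= 22.9.0 or >= 23.7 got: {conda_version}"
        )


check_conda_version("23.7.2")  # accepted: above the broken range
check_conda_version("22.9.0")  # accepted: not strictly greater than 22.9.0
```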
conda_info_output = execute_command(["conda", "info", "--json"]) conda_info = json.loads(conda_info_output) conda_version = Version(conda_info["conda_version"]) - if Version("22.9.0") < conda_version < Version("23.6"): + if Version("22.9.0") < conda_version < Version("23.7"): raise RuntimeError( - f"conda version should be <= 22.9.0 or >= 23.6 got: {conda_version}" + f"conda version should be <= 22.9.0 or >= 23.7 got: {conda_version}" ) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index fc1ef95dbced0..6aecc524a9a30 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -542,6 +542,7 @@ message, the following actions are taken. [pypy] Build & test with PyPy [pyodide] Build & test with Pyodide [azure parallel] Run Azure CI jobs in parallel + [cirrus arm] Run Cirrus CI ARM test [float32] Run float32 tests by setting `SKLEARN_RUN_FLOAT32_TESTS=1`. See :ref:`environment_variable` for more details [doc skip] Docs are not built [doc quick] Docs built, but excludes example gallery plots diff --git a/doc/glossary.rst b/doc/glossary.rst index 36afcd9483684..1dbb7e630c449 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -205,6 +205,29 @@ General Concepts exceptional behaviours on the estimator using semantic :term:`estimator tags`. + cross-fitting + cross fitting + A resampling method that iteratively partitions data into mutually + exclusive subsets to fit two stages. During the first stage, the + mutually exclusive subsets enable predictions or transformations to be + computed on data not seen during training. The computed data is then + used in the second stage. The objective is to avoid having any + overfitting in the first stage introduce bias into the input data + distribution of the second stage. + For examples of its use, see: :class:`~preprocessing.TargetEncoder`, + :class:`~ensemble.StackingClassifier`, + :class:`~ensemble.StackingRegressor` and + :class:`~calibration.CalibratedClassifierCV`. + + cross-validation + cross validation + A resampling method that iteratively partitions data into mutually + exclusive 'train' and 'test' subsets so model performance can be + evaluated on unseen data. This conserves data as avoids the need to hold + out a 'validation' dataset and accounts for variability as multiple + rounds of cross validation are generally performed. + See :ref:`User Guide ` for more details. + deprecation We use deprecation to slowly violate our :term:`backwards compatibility` assurances, usually to: diff --git a/doc/install.rst b/doc/install.rst index bf2832bf72f24..263e83cdc31a5 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -61,7 +61,7 @@ Installing the latest release >Install python3 and python3-pip using the package manager of the Linux Distribution.Install conda using the Anaconda or miniconda - installers or the miniforge installers + installers or the miniforge installers (no administrator permission required for any of those). @@ -279,14 +279,14 @@ and in the `main`, `conda-forge` and `intel` conda channels: conda install scikit-learn-intelex -This package has an Intel optimized version of many estimators. Whenever -an alternative implementation doesn't exist, scikit-learn implementation -is used as a fallback. Those optimized solvers come from the oneDAL -C++ library and are optimized for the x86_64 architecture, and are +This package has an Intel optimized version of many estimators. Whenever +an alternative implementation doesn't exist, scikit-learn implementation +is used as a fallback. 
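The new ``cross-fitting`` glossary entry in the ``doc/glossary.rst`` hunk above can be made concrete with a tiny two-stage sketch (not part of the diff): first-stage predictions are produced out-of-fold with ``cross_val_predict`` and only those are fed to the second stage, so no row is scored by a first-stage model that saw it during fit:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = make_classification(n_samples=300, random_state=0)

# Stage 1: each row is predicted by a model fitted on the other folds.
oof_proba = cross_val_predict(
    RandomForestClassifier(n_estimators=50, random_state=0),
    X,
    y,
    cv=5,
    method="predict_proba",
)

# Stage 2: consumes the cross-fitted predictions, limiting leakage/overfitting.
second_stage = LogisticRegression().fit(oof_proba, y)
```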
Those optimized solvers come from the oneDAL +C++ library and are optimized for the x86_64 architecture, and are optimized for multi-core Intel CPUs. Note that those solvers are not enabled by default, please refer to the -`scikit-learn-intelex `_ +`scikit-learn-intelex `_ documentation for more details on usage scenarios. Direct export example: .. prompt:: bash $ diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst index 635395fd07c43..741ebbf240a6d 100644 --- a/doc/modules/array_api.rst +++ b/doc/modules/array_api.rst @@ -83,17 +83,26 @@ the tensors directly:: >>> X_trans.device.type 'cuda' -.. _array_api_estimators: +.. _array_api_supported: -Estimators with support for `Array API`-compatible inputs -========================================================= +Support for `Array API`-compatible inputs +========================================= + +Estimators and other tools in scikit-learn that support Array API compatible inputs. + +Estimators +---------- - :class:`decomposition.PCA` (with `svd_solver="full"`, `svd_solver="randomized"` and `power_iteration_normalizer="QR"`) - :class:`discriminant_analysis.LinearDiscriminantAnalysis` (with `solver="svd"`) -Coverage for more estimators is expected to grow over time. Please follow the -dedicated `meta-issue on GitHub +Tools +----- + +- :func:`model_selection.train_test_split` + +Coverage is expected to grow over time. Please follow the dedicated `meta-issue on GitHub `_ to track progress. Common estimator checks diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index faba9a76ab94c..f277c32675c3f 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -66,10 +66,8 @@ it takes a variable number of estimators and returns a pipeline, filling in the names automatically:: >>> from sklearn.pipeline import make_pipeline - >>> from sklearn.naive_bayes import MultinomialNB - >>> from sklearn.preprocessing import Binarizer - >>> make_pipeline(Binarizer(), MultinomialNB()) - Pipeline(steps=[('binarizer', Binarizer()), ('multinomialnb', MultinomialNB())]) + >>> make_pipeline(PCA(), SVC()) + Pipeline(steps=[('pca', PCA()), ('svc', SVC())]) Accessing steps ............... diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index 6158e000cb727..8afa467982736 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -102,6 +102,7 @@ where the number of samples is very small. .. image:: ../images/grid_search_cross_validation.png :width: 500px :height: 300px + :alt: A depiction of a 5 fold cross validation on a training set, while holding out a test set. :align: center Computing cross-validated metrics diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index c3ea63bc6e944..36eed98da0f6b 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -10,12 +10,12 @@ Ensembles: Gradient boosting, random forests, bagging, voting, stacking base estimators built with a given learning algorithm in order to improve generalizability / robustness over a single estimator. -Two very famous examples of ensemble methods are `gradient-boosted trees -`_ and `random forests `_. +Two very famous examples of ensemble methods are :ref:`gradient-boosted trees +` and :ref:`random forests `. More generally, ensemble models can be applied to any base learner beyond trees, in averaging methods such as :ref:`Bagging methods `, -`model stacking `_, or `Voting `_, or in +:ref:`model stacking `, or :ref:`Voting `, or in boosting, as :ref:`AdaBoost `. .. 
contents:: diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 43356763d69c3..aa9184a2bedc5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -37,7 +37,7 @@ solves a problem of the form: :align: center :scale: 50% -:class:`LinearRegression` will take in its ``fit`` method arrays X, y +:class:`LinearRegression` will take in its ``fit`` method arrays ``X``, ``y`` and will store the coefficients :math:`w` of the linear model in its ``coef_`` member:: @@ -114,7 +114,7 @@ of shrinkage and thus the coefficients become more robust to collinearity. As with other linear models, :class:`Ridge` will take in its ``fit`` method -arrays X, y and will store the coefficients :math:`w` of the linear model in +arrays ``X``, ``y`` and will store the coefficients :math:`w` of the linear model in its ``coef_`` member:: >>> from sklearn import linear_model @@ -889,12 +889,16 @@ the probability of the positive class :math:`P(y_i=1|X_i)` as .. math:: \hat{p}(X_i) = \operatorname{expit}(X_i w + w_0) = \frac{1}{1 + \exp(-X_i w - w_0)}. + As an optimization problem, binary class logistic regression with regularization term :math:`r(w)` minimizes the following cost function: -.. math:: \min_{w} C \sum_{i=1}^n \left(-y_i \log(\hat{p}(X_i)) - (1 - y_i) \log(1 - \hat{p}(X_i))\right) + r(w). - +.. math:: + :name: regularized-logistic-loss + + \min_{w} C \sum_{i=1}^n \left(-y_i \log(\hat{p}(X_i)) - (1 - y_i) \log(1 - \hat{p}(X_i))\right) + r(w). + We currently provide four choices for the regularization term :math:`r(w)` via the `penalty` argument: diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index d11287e7c29b1..d3a7df74e6348 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -188,13 +188,9 @@ distance can be supplied to compute the weights. .. |classification_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_classification_001.png :target: ../auto_examples/neighbors/plot_classification.html - :scale: 50 - -.. |classification_2| image:: ../auto_examples/neighbors/images/sphx_glr_plot_classification_002.png - :target: ../auto_examples/neighbors/plot_classification.html - :scale: 50 + :scale: 75 -.. centered:: |classification_1| |classification_2| +.. centered:: |classification_1| .. topic:: Examples: diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 1d7ad07f7023c..82fecf0c4e9f1 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -910,16 +910,16 @@ For continuous targets, the formulation is similar to binary classification: where :math:`L_i` is the set of observations with category :math:`i` and :math:`n_i` is the number of observations with category :math:`i`. -:meth:`~TargetEncoder.fit_transform` internally relies on a cross fitting +:meth:`~TargetEncoder.fit_transform` internally relies on a :term:`cross fitting` scheme to prevent target information from leaking into the train-time representation, especially for non-informative high-cardinality categorical variables, and help prevent the downstream model from overfitting spurious correlations. Note that as a result, `fit(X, y).transform(X)` does not equal `fit_transform(X, y)`. In :meth:`~TargetEncoder.fit_transform`, the training -data is split into *k* folds (determined by the `cv` parameter) and encodes each -fold using the encodings trained on the other *k-1* folds. 
The following diagram -shows the cross fitting scheme in :meth:`~TargetEncoder.fit_transform` with -the default `cv=5`: +data is split into *k* folds (determined by the `cv` parameter) and each fold is +encoded using the encodings learnt using the other *k-1* folds. The following +diagram shows the :term:`cross fitting` scheme in +:meth:`~TargetEncoder.fit_transform` with the default `cv=5`: .. image:: ../images/target_encoder_cross_validation.svg :width: 600 @@ -929,10 +929,10 @@ the default `cv=5`: the whole training set. This is never used in :meth:`~TargetEncoder.fit_transform` but is saved to the attribute `encodings_`, for use when :meth:`~TargetEncoder.transform` is called. Note that the encodings -learned for each fold during the cross fitting scheme are not saved to an -attribute. +learned for each fold during the :term:`cross fitting` scheme are not saved to +an attribute. -The :meth:`~TargetEncoder.fit` method does **not** use any cross fitting +The :meth:`~TargetEncoder.fit` method does **not** use any :term:`cross fitting` schemes and learns one encoding on the entire training set, which is used to encode categories in :meth:`~TargetEncoder.transform`. This encoding is the same as the 'full data' diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 7e886366aebae..0ac34cdcb6a10 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -60,14 +60,19 @@ capable of performing binary and multi-class classification on a dataset. :align: center -:class:`SVC` and :class:`NuSVC` are similar methods, but accept -slightly different sets of parameters and have different mathematical -formulations (see section :ref:`svm_mathematical_formulation`). On the -other hand, :class:`LinearSVC` is another (faster) implementation of Support -Vector Classification for the case of a linear kernel. Note that -:class:`LinearSVC` does not accept parameter ``kernel``, as this is -assumed to be linear. It also lacks some of the attributes of -:class:`SVC` and :class:`NuSVC`, like ``support_``. +:class:`SVC` and :class:`NuSVC` are similar methods, but accept slightly +different sets of parameters and have different mathematical formulations (see +section :ref:`svm_mathematical_formulation`). On the other hand, +:class:`LinearSVC` is another (faster) implementation of Support Vector +Classification for the case of a linear kernel. It also +lacks some of the attributes of :class:`SVC` and :class:`NuSVC`, like +`support_`. :class:`LinearSVC` uses `squared_hinge` loss and due to its +implementation in `liblinear` it also regularizes the intercept, if considered. +This effect can however be reduced by carefully fine tuning its +`intercept_scaling` parameter, which allows the intercept term to have a +different regularization behavior compared to the other features. The +classification results and score can therefore differ from the other two +classifiers. As other classifiers, :class:`SVC`, :class:`NuSVC` and :class:`LinearSVC` take as input two arrays: an array `X` of shape @@ -314,10 +319,15 @@ target. There are three different implementations of Support Vector Regression: :class:`SVR`, :class:`NuSVR` and :class:`LinearSVR`. :class:`LinearSVR` -provides a faster implementation than :class:`SVR` but only considers -the linear kernel, while :class:`NuSVR` implements a slightly different -formulation than :class:`SVR` and :class:`LinearSVR`. See -:ref:`svm_implementation_details` for further details. 
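The ``svm.rst`` wording above about `liblinear` regularizing the intercept can be observed directly by varying ``intercept_scaling``; a small sketch (not part of the diff, and the size of the effect depends on the data — this only shows the knob being described):

```python
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=5, random_state=0)

# liblinear penalizes the intercept together with the weights; a larger
# intercept_scaling effectively reduces the regularization on the intercept.
for scaling in (1.0, 10.0, 100.0):
    clf = LinearSVC(C=0.01, intercept_scaling=scaling, random_state=0).fit(X, y)
    print(scaling, clf.intercept_)
```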
+provides a faster implementation than :class:`SVR` but only considers the +linear kernel, while :class:`NuSVR` implements a slightly different formulation +than :class:`SVR` and :class:`LinearSVR`. Due to its implementation in +`liblinear` :class:`LinearSVR` also regularizes the intercept, if considered. +This effect can however be reduced by carefully fine tuning its +`intercept_scaling` parameter, which allows the intercept term to have a +different regularization behavior compared to the other features. The +classification results and score can therefore differ from the other two +classifiers. See :ref:`svm_implementation_details` for further details. As with classification classes, the fit method will take as argument vectors X, y, only that in this case y is expected to have diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index ae82af1366966..7ae039e64a49a 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -27,8 +27,8 @@ Some advantages of decision trees are: - Requires little data preparation. Other techniques often require data normalization, dummy variables need to be created and blank values to - be removed. Note however that this module does not support missing - values. + be removed. Some tree and algorithm combinations support + :ref:`missing values `. - The cost of using the tree (i.e., predicting data) is logarithmic in the number of data points used to train the tree. diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 9cc70ad89ffff..10304a7070be0 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -21,9 +21,6 @@ enhance the functionality of scikit-learn's estimators. **Data formats** -- `Fast svmlight / libsvm file loader `_ - Fast and memory-efficient svmlight / libsvm file loader for Python. - - `sklearn_pandas `_ bridge for scikit-learn pipelines and pandas data frame with dedicated transformers. @@ -64,19 +61,20 @@ enhance the functionality of scikit-learn's estimators. It incorporates multiple modeling libraries under one API, and the objects that EvalML creates use an sklearn-compatible API. -**Experimentation frameworks** +**Experimentation and model registry frameworks** + +- `MLFlow `_ MLflow is an open source platform to manage the ML + lifecycle, including experimentation, reproducibility, deployment, and a central + model registry. - `Neptune `_ Metadata store for MLOps, - built for teams that run a lot of experiments.‌ It gives you a single + built for teams that run a lot of experiments. It gives you a single place to log, store, display, organize, compare, and query all your model building metadata. - `Sacred `_ Tool to help you configure, organize, log and reproduce experiments -- `REP `_ Environment for conducting data-driven - research in a consistent and reproducible way - - `Scikit-Learn Laboratory `_ A command-line wrapper around scikit-learn that makes it easy to run machine learning @@ -91,10 +89,7 @@ enhance the functionality of scikit-learn's estimators. debugging/inspecting machine learning models and explaining their predictions. -- `mlxtend `_ Includes model visualization - utilities. - -- `sklearn-evaluation `_ +- `sklearn-evaluation `_ Machine learning model evaluation made easy: plots, tables, HTML reports, experiment tracking and Jupyter notebook analysis. Visual analysis, model selection, evaluation and diagnostics. @@ -140,7 +135,15 @@ enhance the functionality of scikit-learn's estimators. 
- `treelite `_ Compiles tree-based ensemble models into C code for minimizing prediction latency. - + +- `micromlgen `_ + MicroML brings Machine Learning algorithms to microcontrollers. + Supports several scikit-learn classifiers by transpiling them to C code. + +- `emlearn `_ + Implements scikit-learn estimators in C99 for embedded devices and microcontrollers. + Supports several classifier, regression and outlier detection models. + **Model throughput** - `Intel(R) Extension for scikit-learn `_ @@ -161,12 +164,40 @@ project. The following are projects providing interfaces similar to scikit-learn for additional learning algorithms, infrastructures and tasks. -**Structured learning** +**Time series and forecasting** + +- `Darts `_ Darts is a Python library for + user-friendly forecasting and anomaly detection on time series. It contains a variety + of models, from classics such as ARIMA to deep neural networks. The forecasting + models can all be used in the same way, using fit() and predict() functions, similar + to scikit-learn. + +- `sktime `_ A scikit-learn compatible + toolbox for machine learning with time series including time series + classification/regression and (supervised/panel) forecasting. + +- `skforecast `_ A python library + that eases using scikit-learn regressors as multi-step forecasters. It also works + with any regressor compatible with the scikit-learn API. + +- `tslearn `_ A machine learning library for + time series that offers tools for pre-processing and feature extraction as well as + dedicated models for clustering, classification and regression. -- `tslearn `_ A machine learning library for time series - that offers tools for pre-processing and feature extraction as well as dedicated models for clustering, classification and regression. +**Gradient (tree) boosting** -- `sktime `_ A scikit-learn compatible toolbox for machine learning with time series including time series classification/regression and (supervised/panel) forecasting. +Note scikit-learn own modern gradient boosting estimators +:class:`~sklearn.ensemble.HistGradientBoostingClassifier` and +:class:`~sklearn.ensemble.HistGradientBoostingRegressor`. + +- `XGBoost `_ XGBoost is an optimized distributed + gradient boosting library designed to be highly efficient, flexible and portable. + +- `LightGBM `_ LightGBM is a gradient boosting + framework that uses tree based learning algorithms. It is designed to be distributed + and efficient. + +**Structured learning** - `HMMLearn `_ Implementation of hidden markov models that was previously part of scikit-learn. @@ -182,21 +213,9 @@ and tasks. (`CRFsuite `_ wrapper with sklearn-like API). -- `skforecast `_ A python library - that eases using scikit-learn regressors as multi-step forecasters. It also works - with any regressor compatible with the scikit-learn API. **Deep neural networks etc.** -- `nolearn `_ A number of wrappers and - abstractions around existing neural network libraries - -- `Keras `_ High-level API for - TensorFlow with a scikit-learn inspired API. - -- `lasagne `_ A lightweight library to - build and train neural networks in Theano. - - `skorch `_ A scikit-learn compatible neural network library that wraps PyTorch. @@ -219,9 +238,6 @@ and tasks. **Other regression and classification** -- `xgboost `_ Optimised gradient boosted decision - tree library. - - `ML-Ensemble `_ Generalized ensemble learning (stacking, blending, subsemble, deep ensembles, etc.). @@ -232,10 +248,6 @@ and tasks. 
- `py-earth `_ Multivariate adaptive regression splines -- `Kernel Regression `_ - Implementation of Nadaraya-Watson kernel regression with automatic bandwidth - selection - - `gplearn `_ Genetic Programming for symbolic regression tasks. @@ -245,8 +257,6 @@ and tasks. - `seglearn `_ Time series and sequence learning using sliding window segmentation. -- `libOPF `_ Optimal path forest classifier - - `fastFM `_ Fast factorization machine implementation compatible with scikit-learn @@ -266,6 +276,7 @@ and tasks. - `hdbscan `_ HDBSCAN and Robust Single Linkage clustering algorithms for robust variable density clustering. + As of scikit-learn version 1.3.0, there is :class:`~sklearn.cluster.HDBSCAN`. - `spherecluster `_ Spherical K-means and mixture of von Mises Fisher clustering routines for data on the @@ -276,6 +287,8 @@ and tasks. - `categorical-encoding `_ A library of sklearn compatible categorical variable encoders. + As of scikit-learn version 1.3.0, there is + :class:`~sklearn.preprocessing.TargetEncoder`. - `imbalanced-learn `_ Various @@ -331,9 +344,6 @@ Recommendation Engine packages - `OpenRec `_ TensorFlow-based neural-network inspired recommendation algorithms. -- `Spotlight `_ Pytorch-based - implementation of deep recommender models. - - `Surprise Lib `_ Library for explicit feedback datasets. @@ -355,9 +365,6 @@ Domain specific packages - `AstroML `_ Machine learning for astronomy. -- `MSMBuilder `_ Machine learning for protein - conformational dynamics time series. - Translations of scikit-learn documentation ------------------------------------------ diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 0a8822cdcd848..40ac5e25ea698 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -661,13 +661,19 @@ div.sk-sidebar-global-toc ul ul { div.sk-page-content h1 { background-color: #cde8ef; padding: 0.5rem; - margin-top: calc(max(2.5rem, 1vh)); + margin-top: calc(max(1rem, 1vh)); border-radius: 0 1rem; text-align: center; font-size: 2rem; word-wrap: break-word; } +/* General sibling selector: does not apply to first h1, to avoid gap in + * top of page */ +div.sk-page-content ~ h1 { + margin-top: calc(max(2.5rem, 1vh)); +} + div.sk-page-content h2 { padding: 0.5rem; background-color: #BED4EB; diff --git a/doc/tutorial/machine_learning_map/pyparsing.py b/doc/tutorial/machine_learning_map/pyparsing.py index 0418cf2b51528..88d00e138d02c 100644 --- a/doc/tutorial/machine_learning_map/pyparsing.py +++ b/doc/tutorial/machine_learning_map/pyparsing.py @@ -21,7 +21,7 @@ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # -# flake8: noqa +# ruff: noqa __doc__ = \ """ diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index fea27b0c1c1a4..da2f5e8796db8 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -392,7 +392,7 @@ Changelog - |Efficiency| :class:`decomposition.NMF` with `solver="mu"` fitted on sparse input matrices now uses batching to avoid briefly allocating an array with size - (#non-zero elements, n_components). :pr:`15257` by `Mart Willocx `_. + (#non-zero elements, n_components). :pr:`15257` by :user:`Mart Willocx `. 
- |Enhancement| :func:`decomposition.dict_learning` and :func:`decomposition.dict_learning_online` now accept `method_max_iter` and diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 8d39ca2fed143..dc955f7aa0f51 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -9,9 +9,22 @@ Version 1.3.1 **In development** +Changes impacting all modules +----------------------------- + +- |Fix| The `set_output` API correctly works with list input. :pr:`27044` by + `Thomas Fan`_. + Changelog --------- +:mod:`sklearn.impute` +..................... + +- |Fix| :class:`impute.KNNImputer` now correctly adds a missing indicator column in + ``transform`` when ``add_indicator`` is set to ``True`` and missing values are observed + during ``fit``. :pr:`26600` by :user:`Shreesha Kumar Bhat `. + :mod:`sklearn.neighbors` ........................ @@ -23,6 +36,22 @@ Changelog :attr:`sklearn.neighbors.KDTree.valid_metrics` as public class attributes. :pr:`26754` by :user:`Julien Jerphanion `. +- |Fix| :class:`sklearn.model_selection.HalvingRandomSearchCV` no longer raises + when the input to the `param_distributions` parameter is a list of dicts. + :pr:`26893` by :user:`Stefanie Senger `. + +:mod:`sklearn.preprocessing` +............................ + +- |Fix| :class:`preprocessing.LabelEncoder` correctly accepts `y` as a keyword + argument. :pr:`26940` by `Thomas Fan`_. + +:mod:`sklearn.tree` +................... + +- |Fix| :func:`tree.plot_tree` now accepts `class_names=True` as documented. + :pr:`26903` by :user:`Thomas Roehr <2maz>` + .. _changes_1_3: Version 1.3.0 @@ -596,6 +625,13 @@ Changelog `n_targets`, which is used to decide the number of outputs when sampling from the prior distributions. :pr:`23099` by :user:`Zhehao Liu `. +:mod:`sklearn.mixture` +...................... + +- |Efficiency| :class:`GaussianMixture` is more efficient now and will bypass unnecessary + initialization if the weights, means, and precisions are given by users. + :pr:`26021` by :user:`Jiawei Zhang `. + :mod:`sklearn.model_selection` .............................. diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index c2b7d19404af9..e168f1d667607 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -19,6 +19,11 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. +- |Fix| The initialization of :class:`mixture.GaussianMixture` from user-provided + `precisions_init` for `covariance_type` of `full` or `tied` was not correct, + and has been fixed. + :pr:`26416` by :user:`Yang Tao `. + Changes impacting all modules ----------------------------- @@ -61,6 +66,27 @@ Changelog - |Enhancement| :func:`base.clone` now supports `dict` as input and creates a copy. :pr:`26786` by `Adrin Jalali`_. +- |API|:func:`~utils.metadata_routing.process_routing` now has a different + signature. The first two (the object and the method) are positional only, + and all metadata are passed as keyword arguments. :pr:`26909` by `Adrin + Jalali`_. + +:mod:`sklearn.cluster` +............................ + +- |API| : `kdtree` and `balltree` values are now deprecated and are renamed as + `kd_tree` and `ball_tree` respectively for the `algorithm` parameter of + :class:`cluster.HDBSCAN` ensuring consistency in naming convention. + `kdtree` and `balltree` values will be removed in 1.6. + :pr:`26744` by :user:`Shreesha Kumar Bhat `. 
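The ``v1.3.rst`` entry above for ``HalvingRandomSearchCV`` concerns passing a list of dicts as ``param_distributions``; a short sketch of that input shape, assuming a release that contains the referenced fix (the search class still requires the experimental import):

```python
from scipy.stats import randint

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingRandomSearchCV

X, y = make_classification(n_samples=400, random_state=0)

# A list of dicts: each dict is one sub-space of distributions to sample from.
param_distributions = [
    {"max_depth": [3, 5, None]},
    {"min_samples_split": randint(2, 11)},
]

search = HalvingRandomSearchCV(
    RandomForestClassifier(random_state=0),
    param_distributions,
    random_state=0,
).fit(X, y)
print(search.best_params_)
```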
+ +:mod:`sklearn.cross_decomposition` +.................................. + +- |Fix| :class:`cross_decomposition.PLSRegression` now automatically ravels the output + of `predict` if fitted with one dimensional `y`. + :pr:`26602` by :user:`Yao Xiao `. + :mod:`sklearn.decomposition` ............................ @@ -80,6 +106,12 @@ Changelog :mod:`sklearn.ensemble` ....................... +- |MajorFeature| :class:`ensemble.RandomForestClassifier` and + :class:`ensemble.RandomForestRegressor` support missing values when + the criterion is `gini`, `entropy`, or `log_loss`, + for classification or `squared_error`, `friedman_mse`, or `poisson` + for regression. :pr:`26391` by `Thomas Fan`_. + - |Feature| :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, :class:`ensemble.ExtraTreesClassifier` and :class:`ensemble.ExtraTreesRegressor` now support monotonic constraints, @@ -88,6 +120,11 @@ Changelog :pr:`13649` by :user:`Samuel Ronsin `, initiated by :user:`Patrick O'Reilly `. +- |Efficiency| Improves runtime and memory usage for + :class:`ensemble.GradientBoostingClassifier` and + :class:`ensemble.GradientBoostingRegressor` when trained on sparse data. + :pr:`26957` by `Thomas Fan`_. + :mod:`sklearn.feature_selection` ................................ @@ -120,6 +157,37 @@ Changelog object in the parameter grid if it's an estimator. :pr:`26786` by `Adrin Jalali`_. +- |Feature| :func:`~model_selection.cross_validate`, + :func:`~model_selection.cross_val_score`, and + :func:`~model_selection.cross_val_predict` now support metadata routing. The + metadata are routed to the estimator's `fit`, the scorer, and the CV + splitter's `split`. The metadata is accepted via the new `params` parameter. + `fit_params` is deprecated and will be removed in version 1.6. `groups` + parameter is also not accepted as a separate argument when metadata routing + is enabled and should be passed via the `params` parameter. :pr:`26896` by + `Adrin Jalali`_. + +:mod:`sklearn.neighbors` +........................ + +- |Fix| Neighbors based estimators now correctly work when `metric="minkowski"` and the + metric parameter `p` is in the range `0 < p < 1`, regardless of the `dtype` of `X`. + :pr:`26760` by :user:`Shreesha Kumar Bhat `. + +:mod:`sklearn.preprocessing` +............................ + +- |Efficiency| :class:`preprocessing.OrdinalEncoder` avoids calculating + missing indices twice to improve efficiency. + :pr:`27017` by :user:`Xuefeng Xu `. + +- |Fix| :class:`preprocessing.OneHotEncoder` shows a more informative error message + when `sparse_output=True` and the output is configured to be pandas. + :pr:`26931` by `Thomas Fan`_. + +- |Enhancement| :func:`sklearn.model_selection.train_test_split` now supports + Array API compatible inputs. :pr:`26855` by `Tim Head`_. + :mod:`sklearn.tree` ................... @@ -131,9 +199,37 @@ Changelog :pr:`13649` by :user:`Samuel Ronsin `, initiated by :user:`Patrick O'Reilly `. + +:mod:`sklearn.neighbors` +........................ + +- |API| :class:`neighbors.KNeighborsRegressor` now accepts + :class:`metric.DistanceMetric` objects directly via the `metric` keyword + argument allowing for the use of accelerated third-party + :class:`metric.DistanceMetric` objects. + :pr:`26267` by :user:`Meekail Zain ` + +:mod:`sklearn.metrics` +...................... + +- |Efficiency| Computing pairwise distances via :class:`metrics.DistanceMetric` + for CSR × CSR, Dense × CSR, and CSR × Dense datasets is now 1.5x faster. 
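The ``v1.4.rst`` entry above adds native missing-value support to the random forests; a minimal sketch of what that enables, assuming a scikit-learn build where that entry applies:

```python
import numpy as np

from sklearn.ensemble import RandomForestClassifier

X = np.array([[1.0], [2.0], [np.nan], [4.0], [np.nan], [6.0]] * 5)
y = np.array([0, 0, 0, 1, 1, 1] * 5)

# NaNs are handled natively for the supported criteria (default "gini" here),
# both at fit time and at predict time.
clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
print(clf.predict([[np.nan], [1.5], [5.0]]))
```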
+ :pr:`26765` by :user:`Meekail Zain ` + +- |Efficiency| Computing distances via :class:`metrics.DistanceMetric` + for CSR × CSR, Dense × CSR, and CSR × Dense now uses ~50% less memory, + and outputs distances in the same dtype as the provided data. + :pr:`27006` by :user:`Meekail Zain ` + :mod:`sklearn.utils` .................... +- |Enhancement| :func:`sklearn.utils.estimator_html_repr` dynamically adapts + diagram colors based on the browser's `prefers-color-scheme`, providing + improved adaptability to dark mode environments. + :pr:`26862` by :user:`Andrew Goh Yisheng <9y5>`, `Thomas Fan`_, `Adrin + Jalali`_. + - |Enhancement| :class:`~utils.metadata_routing.MetadataRequest` and :class:`~utils.metadata_routing.MetadataRouter` now have a ``consumes`` method which can be used to check whether a given set of parameters would be consumed. diff --git a/examples/classification/plot_classifier_comparison.py b/examples/classification/plot_classifier_comparison.py index 75164cff8b492..8d7eb7c63c81a 100644 --- a/examples/classification/plot_classifier_comparison.py +++ b/examples/classification/plot_classifier_comparison.py @@ -58,13 +58,15 @@ classifiers = [ KNeighborsClassifier(3), - SVC(kernel="linear", C=0.025), - SVC(gamma=2, C=1), - GaussianProcessClassifier(1.0 * RBF(1.0)), - DecisionTreeClassifier(max_depth=5), - RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), - MLPClassifier(alpha=1, max_iter=1000), - AdaBoostClassifier(), + SVC(kernel="linear", C=0.025, random_state=42), + SVC(gamma=2, C=1, random_state=42), + GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42), + DecisionTreeClassifier(max_depth=5, random_state=42), + RandomForestClassifier( + max_depth=5, n_estimators=10, max_features=1, random_state=42 + ), + MLPClassifier(alpha=1, max_iter=1000, random_state=42), + AdaBoostClassifier(random_state=42), GaussianNB(), QuadraticDiscriminantAnalysis(), ] diff --git a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py index 7eab9a3437d65..0dde24116065d 100644 --- a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py +++ b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py @@ -12,7 +12,7 @@ trees according to each estimator: - `n_estimators` controls the number of trees in the forest. It's a fixed number. -- `max_iter` is the the maximum number of iterations in a gradient boosting +- `max_iter` is the maximum number of iterations in a gradient boosting based model. The number of iterations corresponds to the number of trees for regression and binary classification problems. Furthermore, the actual number of trees required by the model depends on the stopping criteria. @@ -210,7 +210,7 @@ # models uniformly dominate the Random Forest models in the "test score vs # training speed trade-off" (the HGBDT curve should be on the top left of the RF # curve, without ever crossing). The "test score vs prediction speed" trade-off -# can also be more disputed but it's most often favorable to HGBDT. It's always +# can also be more disputed, but it's most often favorable to HGBDT. 
It's always # a good idea to check both kinds of model (with hyper-parameter tuning) and # compare their performance on your specific problem to determine which model is # the best fit but **HGBT almost always offers a more favorable speed-accuracy diff --git a/examples/feature_selection/plot_select_from_model_diabetes.py b/examples/feature_selection/plot_select_from_model_diabetes.py index 688c2b4ba8079..f008d8d6e8b68 100644 --- a/examples/feature_selection/plot_select_from_model_diabetes.py +++ b/examples/feature_selection/plot_select_from_model_diabetes.py @@ -122,9 +122,6 @@ print(f"Done in {toc_bwd - tic_bwd:.3f}s") # %% -# Discussion -# ---------- -# # Interestingly, forward and backward selection have selected the same set of # features. In general, this isn't the case and the two methods would lead to # different results. @@ -145,3 +142,54 @@ # attribute. The forward SFS is faster than the backward SFS because it only # needs to perform `n_features_to_select = 2` iterations, while the backward # SFS needs to perform `n_features - n_features_to_select = 8` iterations. +# +# Using negative tolerance values +# ------------------------------- +# +# :class:`~sklearn.feature_selection.SequentialFeatureSelector` can be used +# to remove features present in the dataset and return a +# smaller subset of the original features with `direction="backward"` +# and a negative value of `tol`. +# +# We begin by loading the Breast Cancer dataset, consisting of 30 different +# features and 569 samples. +import numpy as np + +from sklearn.datasets import load_breast_cancer + +breast_cancer_data = load_breast_cancer() +X, y = breast_cancer_data.data, breast_cancer_data.target +feature_names = np.array(breast_cancer_data.feature_names) +print(breast_cancer_data.DESCR) + +# %% +# We will make use of the :class:`~sklearn.linear_model.LogisticRegression` +# estimator with :class:`~sklearn.feature_selection.SequentialFeatureSelector` +# to perform the feature selection. +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_auc_score +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler + +for tol in [-1e-2, -1e-3, -1e-4]: + start = time() + feature_selector = SequentialFeatureSelector( + LogisticRegression(), + n_features_to_select="auto", + direction="backward", + scoring="roc_auc", + tol=tol, + n_jobs=2, + ) + model = make_pipeline(StandardScaler(), feature_selector, LogisticRegression()) + model.fit(X, y) + end = time() + print(f"\ntol: {tol}") + print(f"Features selected: {feature_names[model[1].get_support()]}") + print(f"ROC AUC score: {roc_auc_score(y, model.predict_proba(X)[:, 1]):.3f}") + print(f"Done in {end - start:.3f}s") + +# %% +# We can see that the number of features selected tend to increase as negative +# values of `tol` approach to zero. The time taken for feature selection also +# decreases as the values of `tol` come closer to zero. 
diff --git a/examples/miscellaneous/plot_metadata_routing.py b/examples/miscellaneous/plot_metadata_routing.py index 350cd865d972e..9984bb6183348 100644 --- a/examples/miscellaneous/plot_metadata_routing.py +++ b/examples/miscellaneous/plot_metadata_routing.py @@ -447,7 +447,7 @@ def get_metadata_routing(self): return router def fit(self, X, y, **fit_params): - params = process_routing(self, "fit", fit_params) + params = process_routing(self, "fit", **fit_params) self.transformer_ = clone(self.transformer).fit(X, y, **params.transformer.fit) X_transformed = self.transformer_.transform(X, **params.transformer.transform) @@ -458,7 +458,7 @@ def fit(self, X, y, **fit_params): return self def predict(self, X, **predict_params): - params = process_routing(self, "predict", predict_params) + params = process_routing(self, "predict", **predict_params) X_transformed = self.transformer_.transform(X, **params.transformer.transform) return self.classifier_.predict(X_transformed, **params.classifier.predict) @@ -543,7 +543,7 @@ def __init__(self, estimator): self.estimator = estimator def fit(self, X, y, **fit_params): - params = process_routing(self, "fit", fit_params) + params = process_routing(self, "fit", **fit_params) self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) def get_metadata_routing(self): @@ -572,7 +572,7 @@ def __init__(self, estimator): self.estimator = estimator def fit(self, X, y, sample_weight=None, **fit_params): - params = process_routing(self, "fit", fit_params, sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **fit_params) check_metadata(self, sample_weight=sample_weight) self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) diff --git a/examples/neighbors/plot_classification.py b/examples/neighbors/plot_classification.py index 4ed23862ae455..43c45558054cf 100644 --- a/examples/neighbors/plot_classification.py +++ b/examples/neighbors/plot_classification.py @@ -3,61 +3,92 @@ Nearest Neighbors Classification ================================ -Sample usage of Nearest Neighbors classification. -It will plot the decision boundaries for each class. - +This example shows how to use :class:`~sklearn.neighbors.KNeighborsClassifier`. +We train such a classifier on the iris dataset and observe the difference of the +decision boundary obtained with regards to the parameter `weights`. """ -import matplotlib.pyplot as plt -import seaborn as sns -from matplotlib.colors import ListedColormap +# %% +# Load the data +# ------------- +# +# In this example, we use the iris dataset. We split the data into a train and test +# dataset. +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split -from sklearn import datasets, neighbors -from sklearn.inspection import DecisionBoundaryDisplay +iris = load_iris(as_frame=True) +X = iris.data[["sepal length (cm)", "sepal width (cm)"]] +y = iris.target +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0) -n_neighbors = 15 +# %% +# K-nearest neighbors classifier +# ------------------------------ +# +# We want to use a k-nearest neighbors classifier considering a neighborhood of 11 data +# points. Since our k-nearest neighbors model uses euclidean distance to find the +# nearest neighbors, it is therefore important to scale the data beforehand. Refer to +# the example entitled +# :ref:`sphx_glr_auto_examples_preprocessing_plot_scaling_importance.py` for more +# detailed information. 
+# +# Thus, we use a :class:`~sklearn.pipeline.Pipeline` to chain a scaler before to use +# our classifier. +from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler -# import some data to play with -iris = datasets.load_iris() +clf = Pipeline( + steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=11))] +) -# we only take the first two features. We could avoid this ugly -# slicing by using a two-dim dataset -X = iris.data[:, :2] -y = iris.target +# %% +# Decision boundary +# ----------------- +# +# Now, we fit two classifiers with different values of the parameter +# `weights`. We plot the decision boundary of each classifier as well as the original +# dataset to observe the difference. +import matplotlib.pyplot as plt -# Create color maps -cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"]) -cmap_bold = ["darkorange", "c", "darkblue"] +from sklearn.inspection import DecisionBoundaryDisplay -for weights in ["uniform", "distance"]: - # we create an instance of Neighbours Classifier and fit the data. - clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights) - clf.fit(X, y) +_, axs = plt.subplots(ncols=2, figsize=(12, 5)) - _, ax = plt.subplots() - DecisionBoundaryDisplay.from_estimator( +for ax, weights in zip(axs, ("uniform", "distance")): + clf.set_params(knn__weights=weights).fit(X_train, y_train) + disp = DecisionBoundaryDisplay.from_estimator( clf, - X, - cmap=cmap_light, - ax=ax, + X_test, response_method="predict", plot_method="pcolormesh", xlabel=iris.feature_names[0], ylabel=iris.feature_names[1], shading="auto", + alpha=0.5, + ax=ax, ) - - # Plot also the training points - sns.scatterplot( - x=X[:, 0], - y=X[:, 1], - hue=iris.target_names[y], - palette=cmap_bold, - alpha=1.0, - edgecolor="black", + scatter = disp.ax_.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, edgecolors="k") + disp.ax_.legend( + scatter.legend_elements()[0], + iris.target_names, + loc="lower left", + title="Classes", ) - plt.title( - "3-Class classification (k = %i, weights = '%s')" % (n_neighbors, weights) + _ = disp.ax_.set_title( + f"3-Class classification\n(k={clf[-1].n_neighbors}, weights={weights!r})" ) plt.show() + +# %% +# Conclusion +# ---------- +# +# We observe that the parameter `weights` has an impact on the decision boundary. When +# `weights="unifom"` all nearest neighbors will have the same impact on the decision. +# Whereas when `weights="distance"` the weight given to each neighbor is proportional +# to the inverse of the distance from that neighbor to the query point. +# +# In some cases, taking the distance into account might improve the model. diff --git a/examples/preprocessing/plot_target_encoder_cross_val.py b/examples/preprocessing/plot_target_encoder_cross_val.py index f4ff643d8b48e..7244a1bf61cd6 100644 --- a/examples/preprocessing/plot_target_encoder_cross_val.py +++ b/examples/preprocessing/plot_target_encoder_cross_val.py @@ -6,21 +6,26 @@ .. currentmodule:: sklearn.preprocessing The :class:`TargetEncoder` replaces each category of a categorical feature with -the mean of the target variable for that category. This method is useful +the shrunk mean of the target variable for that category. This method is useful in cases where there is a strong relationship between the categorical feature and the target. 
To prevent overfitting, :meth:`TargetEncoder.fit_transform` uses -an internal cross fitting scheme to encode the training data to be used by a -downstream model. In this example, we demonstrate the importance of the cross fitting -procedure to prevent overfitting. +an internal :term:`cross fitting` scheme to encode the training data to be used +by a downstream model. This scheme involves splitting the data into *k* folds +and encoding each fold using the encodings learnt using the other *k-1* folds. +In this example, we demonstrate the importance of the cross +fitting procedure to prevent overfitting. """ # %% # Create Synthetic Dataset # ======================== -# For this example, we build a dataset with three categorical features: an informative -# feature with medium cardinality, an uninformative feature with medium cardinality, -# and an uninformative feature with high cardinality. First, we generate the informative -# feature: +# For this example, we build a dataset with three categorical features: +# +# * an informative feature with medium cardinality ("informative") +# * an uninformative feature with medium cardinality ("shuffled") +# * an uninformative feature with high cardinality ("near_unique") +# +# First, we generate the informative feature: import numpy as np from sklearn.preprocessing import KBinsDiscretizer @@ -33,12 +38,16 @@ n_categories = 100 kbins = KBinsDiscretizer( - n_bins=n_categories, encode="ordinal", strategy="uniform", random_state=rng + n_bins=n_categories, + encode="ordinal", + strategy="uniform", + random_state=rng, + subsample=None, ) X_informative = kbins.fit_transform((y + noise).reshape(-1, 1)) -# Remove the linear relationship between y and the bin index by permuting the values of -# X_informative +# Remove the linear relationship between y and the bin index by permuting the +# values of X_informative: permuted_categories = rng.permutation(n_categories) X_informative = permuted_categories[X_informative.astype(np.int32)] @@ -48,13 +57,13 @@ X_shuffled = rng.permutation(X_informative) # %% -# The uninformative feature with high cardinality is generated so that is independent of -# the target variable. We will show that target encoding without cross fitting will -# cause catastrophic overfitting for the downstream regressor. These high cardinality -# features are basically unique identifiers for samples which should generally be -# removed from machine learning dataset. In this example, we generate them to show how -# :class:`TargetEncoder`'s default cross fitting behavior mitigates the overfitting -# issue automatically. +# The uninformative feature with high cardinality is generated so that it is +# independent of the target variable. We will show that target encoding without +# :term:`cross fitting` will cause catastrophic overfitting for the downstream +# regressor. These high cardinality features are basically unique identifiers +# for samples which should generally be removed from machine learning datasets. +# In this example, we generate them to show how :class:`TargetEncoder`'s default +# :term:`cross fitting` behavior mitigates the overfitting issue automatically. X_near_unique_categories = rng.choice( int(0.9 * n_samples), size=n_samples, replace=True ).reshape(-1, 1) @@ -79,9 +88,10 @@ # ========================== # In this section, we train a ridge regressor on the dataset with and without # encoding and explore the influence of target encoder with and without the -# internal cross fitting. 
First, we see the Ridge model trained on the -# raw features will have low performance, because the order of the informative -# feature is not informative: +# internal :term:`cross fitting`. First, we see the Ridge model trained on the +# raw features will have low performance. This is because we permuted the order +# of the informative feature meaning `X_informative` is not informative when +# raw: import sklearn from sklearn.linear_model import Ridge @@ -96,15 +106,15 @@ # %% # Next, we create a pipeline with the target encoder and ridge model. The pipeline -# uses :meth:`TargetEncoder.fit_transform` which uses cross fitting. We see that -# the model fits the data well and generalizes to the test set: +# uses :meth:`TargetEncoder.fit_transform` which uses :term:`cross fitting`. We +# see that the model fits the data well and generalizes to the test set: from sklearn.pipeline import make_pipeline from sklearn.preprocessing import TargetEncoder -model_with_cv = make_pipeline(TargetEncoder(random_state=0), ridge) -model_with_cv.fit(X_train, y_train) -print("Model with CV on training set: ", model_with_cv.score(X_train, y_train)) -print("Model with CV on test set: ", model_with_cv.score(X_test, y_test)) +model_with_cf = make_pipeline(TargetEncoder(random_state=0), ridge) +model_with_cf.fit(X_train, y_train) +print("Model with CF on train set: ", model_with_cf.score(X_train, y_train)) +print("Model with CF on test set: ", model_with_cf.score(X_test, y_test)) # %% # The coefficients of the linear model shows that most of the weight is on the @@ -114,49 +124,68 @@ plt.rcParams["figure.constrained_layout.use"] = True -coefs_cv = pd.Series( - model_with_cv[-1].coef_, index=model_with_cv[-1].feature_names_in_ +coefs_cf = pd.Series( + model_with_cf[-1].coef_, index=model_with_cf[-1].feature_names_in_ ).sort_values() -_ = coefs_cv.plot(kind="barh") +ax = coefs_cf.plot(kind="barh") +_ = ax.set( + title="Target encoded with cross fitting", + xlabel="Ridge coefficient", + ylabel="Feature", +) # %% -# While :meth:`TargetEncoder.fit_transform` uses an internal cross fitting scheme, -# :meth:`TargetEncoder.transform` itself does not perform any cross fitting. -# It uses the aggregation of the complete training set to transform the categorical -# features. Thus, we can use :meth:`TargetEncoder.fit` followed by -# :meth:`TargetEncoder.transform` to disable the cross fitting. This encoding -# is then passed to the ridge model. +# While :meth:`TargetEncoder.fit_transform` uses an internal +# :term:`cross fitting` scheme to learn encodings for the training set, +# :meth:`TargetEncoder.transform` itself does not. +# It uses the complete training set to learn encodings and to transform the +# categorical features. Thus, we can use :meth:`TargetEncoder.fit` followed by +# :meth:`TargetEncoder.transform` to disable the :term:`cross fitting`. This +# encoding is then passed to the ridge model. 
target_encoder = TargetEncoder(random_state=0) target_encoder.fit(X_train, y_train) -X_train_no_cv_encoding = target_encoder.transform(X_train) -X_test_no_cv_encoding = target_encoder.transform(X_test) +X_train_no_cf_encoding = target_encoder.transform(X_train) +X_test_no_cf_encoding = target_encoder.transform(X_test) -model_no_cv = ridge.fit(X_train_no_cv_encoding, y_train) +model_no_cf = ridge.fit(X_train_no_cf_encoding, y_train) # %% -# We evaluate the model on the non-cross validated encoding and see that it overfits: +# We evaluate the model that did not use :term:`cross fitting` when encoding and +# see that it overfits: print( - "Model without CV on training set: ", - model_no_cv.score(X_train_no_cv_encoding, y_train), + "Model without CF on training set: ", + model_no_cf.score(X_train_no_cf_encoding, y_train), ) print( - "Model without CV on test set: ", model_no_cv.score(X_test_no_cv_encoding, y_test) + "Model without CF on test set: ", + model_no_cf.score( + X_test_no_cf_encoding, + y_test, + ), ) # %% -# The ridge model overfits, because it assigns more weight to the extremely high -# cardinality feature relative to the informative feature. -coefs_no_cv = pd.Series( - model_no_cv.coef_, index=model_no_cv.feature_names_in_ +# The ridge model overfits because it assigns much more weight to the +# uninformative extremely high cardinality ("near_unique") and medium +# cardinality ("shuffled") features than when the model used +# :term:`cross fitting` to encode the features. +coefs_no_cf = pd.Series( + model_no_cf.coef_, index=model_no_cf.feature_names_in_ ).sort_values() -_ = coefs_no_cv.plot(kind="barh") +ax = coefs_no_cf.plot(kind="barh") +_ = ax.set( + title="Target encoded without cross fitting", + xlabel="Ridge coefficient", + ylabel="Feature", +) # %% # Conclusion # ========== -# This example demonstrates the importance of :class:`TargetEncoder`'s internal cross -# fitting. It is important to use :meth:`TargetEncoder.fit_transform` to encode -# training data before passing it to a machine learning model. When a -# :class:`TargetEncoder` is a part of a :class:`~sklearn.pipeline.Pipeline` and the -# pipeline is fitted, the pipeline will correctly call -# :meth:`TargetEncoder.fit_transform` and pass the encoding along. +# This example demonstrates the importance of :class:`TargetEncoder`'s internal +# :term:`cross fitting`. It is important to use +# :meth:`TargetEncoder.fit_transform` to encode training data before passing it +# to a machine learning model. When a :class:`TargetEncoder` is a part of a +# :class:`~sklearn.pipeline.Pipeline` and the pipeline is fitted, the pipeline +# will correctly call :meth:`TargetEncoder.fit_transform` and use +# :term:`cross fitting` when encoding the training data. 
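[Not part of the patch] For readers unfamiliar with the cross fitting scheme this example keeps referring to, the sketch below emulates it with an explicit KFold loop: each fold is encoded with a TargetEncoder fitted on the remaining folds. The real :meth:`TargetEncoder.fit_transform` additionally applies shrinkage toward the global mean and uses stratified splits for classification targets, so this is only an approximation; the helper name is hypothetical.

import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import TargetEncoder

def manual_cross_fit_encode(X, y, n_splits=5, random_state=0):
    """Roughly emulate TargetEncoder.fit_transform's cross fitting:
    every fold is encoded with statistics learnt on the other folds."""
    X = np.asarray(X)
    y = np.asarray(y)
    X_encoded = np.empty(X.shape, dtype=float)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_idx, encode_idx in cv.split(X):
        enc = TargetEncoder(random_state=random_state)
        enc.fit(X[train_idx], y[train_idx])
        X_encoded[encode_idx] = enc.transform(X[encode_idx])
    return X_encoded

# With the X_train / y_train arrays from the example above, this would be
# called as: X_train_encoded = manual_cross_fit_encode(X_train, y_train)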
diff --git a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py index 7c6836632e3f0..d7ae7465a590b 100644 --- a/examples/release_highlights/plot_release_highlights_0_23_0.py +++ b/examples/release_highlights/plot_release_highlights_0_23_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================== Release Highlights for scikit-learn 0.23 diff --git a/examples/release_highlights/plot_release_highlights_0_24_0.py b/examples/release_highlights/plot_release_highlights_0_24_0.py index a55b4aabc7994..29082c1a078f4 100644 --- a/examples/release_highlights/plot_release_highlights_0_24_0.py +++ b/examples/release_highlights/plot_release_highlights_0_24_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================== Release Highlights for scikit-learn 0.24 diff --git a/examples/release_highlights/plot_release_highlights_1_0_0.py b/examples/release_highlights/plot_release_highlights_1_0_0.py index 383612e611688..7ac09dd193c0f 100644 --- a/examples/release_highlights/plot_release_highlights_1_0_0.py +++ b/examples/release_highlights/plot_release_highlights_1_0_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================= Release Highlights for scikit-learn 1.0 diff --git a/examples/release_highlights/plot_release_highlights_1_1_0.py b/examples/release_highlights/plot_release_highlights_1_1_0.py index f6432cf15037c..b3058a7e0aa27 100644 --- a/examples/release_highlights/plot_release_highlights_1_1_0.py +++ b/examples/release_highlights/plot_release_highlights_1_1_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================= Release Highlights for scikit-learn 1.1 diff --git a/examples/release_highlights/plot_release_highlights_1_2_0.py b/examples/release_highlights/plot_release_highlights_1_2_0.py index 8165c3bc4eed0..695e74cfcdd64 100644 --- a/examples/release_highlights/plot_release_highlights_1_2_0.py +++ b/examples/release_highlights/plot_release_highlights_1_2_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================= Release Highlights for scikit-learn 1.2 diff --git a/examples/release_highlights/plot_release_highlights_1_3_0.py b/examples/release_highlights/plot_release_highlights_1_3_0.py index 8fa1ea057ac91..5ce2617cd08aa 100644 --- a/examples/release_highlights/plot_release_highlights_1_3_0.py +++ b/examples/release_highlights/plot_release_highlights_1_3_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================= Release Highlights for scikit-learn 1.3 diff --git a/pyproject.toml b/pyproject.toml index efd72adf44392..c98ed2130189f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,7 @@ exclude=[ # + E501 (line too long) because keeping it < 88 in cython # often makes code less readable. ignore = [ - # check ignored by default in flake8. Meaning unclear. + # multiple spaces/tab after comma 'E24', # space before : (needed for how black formats slicing) 'E203', diff --git a/setup.cfg b/setup.cfg index d91a27344c575..b7705781dbb7d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,7 +20,6 @@ addopts = # correctly on the CI when running `pytest --pyargs sklearn` from the # source folder. 
-p sklearn.tests.random_seed - -rN filterwarnings = ignore:the matrix subclass:PendingDeprecationWarning @@ -54,6 +53,9 @@ ignore = sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx + sklearn/neighbors/_ball_tree.pyx + sklearn/neighbors/_binary_tree.pxi + sklearn/neighbors/_kd_tree.pyx [codespell] diff --git a/setup.py b/setup.py index c41883aa5c37a..5c008944ec05c 100644 --- a/setup.py +++ b/setup.py @@ -306,8 +306,9 @@ def check_package_status(package, min_version): }, ], "neighbors": [ - {"sources": ["_ball_tree.pyx"], "include_np": True}, - {"sources": ["_kd_tree.pyx"], "include_np": True}, + {"sources": ["_binary_tree.pxi.tp"], "include_np": True}, + {"sources": ["_ball_tree.pyx.tp"], "include_np": True}, + {"sources": ["_kd_tree.pyx.tp"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, {"sources": ["_quad_tree.pyx"], "language": "c++", "include_np": True}, ], @@ -514,13 +515,18 @@ def configure_extension_modules(): # `source` is a Tempita file tempita_sources.append(source) - # Do not include pxd files that were generated by tempita - if os.path.splitext(new_source_path)[-1] == ".pxd": - continue - sources.append(new_source_path) + # Only include source files that are pyx files + if os.path.splitext(new_source_path)[-1] == ".pyx": + sources.append(new_source_path) gen_from_templates(tempita_sources) + # Do not progress if we only have a tempita file which we don't + # want to include like the .pxi.tp extension. In such a case + # sources would be empty. + if not sources: + continue + # By convention, our extensions always use the name of the first source source_name = os.path.splitext(os.path.basename(sources[0]))[0] if submodule: diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index f3b61da0915d5..11cb0e42c47f6 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -113,7 +113,7 @@ class BaseLoss: Indicates whether n_classes > 2 is allowed. """ - # For decision trees: + # For gradient boosted decision trees: # This variable indicates whether the loss requires the leaves values to # be updated once the tree has been trained. The trees are trained to # predict a Newton-Raphson step (see grower._finalize_leaf()). But for @@ -122,8 +122,8 @@ class BaseLoss: # procedure. See the original paper Greedy Function Approximation: A # Gradient Boosting Machine by Friedman # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. - need_update_leaves_values = False differentiable = True + need_update_leaves_values = False is_multiclass = False def __init__(self, closs, link, n_classes=None): @@ -543,6 +543,10 @@ class AbsoluteError(BaseLoss): For a given sample x_i, the absolute error is defined as:: loss(x_i) = |y_true_i - raw_prediction_i| + + Note that the exact hessian = 0 almost everywhere (except at one point, therefore + differentiable = False). Optimization routines like in HGBT, however, need a + hessian > 0. Therefore, we assign 1. """ differentiable = False @@ -585,6 +589,10 @@ class PinballLoss(BaseLoss): Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError(). + Note that the exact hessian = 0 almost everywhere (except at one point, therefore + differentiable = False). Optimization routines like in HGBT, however, need a + hessian > 0. Therefore, we assign 1. 
+ Additional Attributes --------------------- quantile : float diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 432ca9e25b152..8d9a964aea172 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -378,10 +378,10 @@ def fit(self, X, y, sample_weight=None, **fit_params): if _routing_enabled(): routed_params = process_routing( - obj=self, - method="fit", + self, + "fit", sample_weight=sample_weight, - other_params=fit_params, + **fit_params, ) else: # sample_weight checks @@ -450,7 +450,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): cv=cv, method=method_name, n_jobs=self.n_jobs, - fit_params=routed_params.estimator.fit, + params=routed_params.estimator.fit, ) predictions = _compute_predictions( pred_method, method_name, X, n_classes @@ -1186,7 +1186,7 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs): f"(Positive class: {self.pos_label})" if self.pos_label is not None else "" ) - line_kwargs = {} + line_kwargs = {"marker": "s", "linestyle": "-"} if name is not None: line_kwargs["label"] = name line_kwargs.update(**kwargs) @@ -1195,9 +1195,7 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs): existing_ref_line = ref_line_label in self.ax_.get_legend_handles_labels()[1] if ref_line and not existing_ref_line: self.ax_.plot([0, 1], [0, 1], "k:", label=ref_line_label) - self.line_ = self.ax_.plot(self.prob_pred, self.prob_true, "s-", **line_kwargs)[ - 0 - ] + self.line_ = self.ax_.plot(self.prob_pred, self.prob_true, **line_kwargs)[0] # We always have to show the legend for at least the reference line self.ax_.legend(loc="lower right") diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 7280bc31423ae..4dd09c9531c44 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -22,6 +22,8 @@ from ._dbscan_inner import dbscan_inner +# This function is not validated using validate_params because +# it's just a factory for DBSCAN. def dbscan( X, eps=0.5, @@ -172,6 +174,9 @@ class DBSCAN(ClusterMixin, BaseEstimator): Finds core samples of high density and expands clusters from them. Good for data which contains clusters of similar density. + The worst case memory complexity of DBSCAN is :math:`O({n}^2)`, which can + occur when the `eps` param is large and `min_samples` is low. + Read more in the :ref:`User Guide `. Parameters @@ -184,8 +189,11 @@ class DBSCAN(ClusterMixin, BaseEstimator): and distance function. min_samples : int, default=5 - The number of samples (or total weight) in a neighborhood for a point - to be considered as a core point. This includes the point itself. + The number of samples (or total weight) in a neighborhood for a point to + be considered as a core point. This includes the point itself. If + `min_samples` is set to a higher value, DBSCAN will find denser clusters, + whereas if it is set to a lower value, the found clusters will be more + sparse. metric : str, or callable, default='euclidean' The metric to use when calculating distance between instances in a diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 57de8962250b1..f8a37c52f55dc 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -462,12 +462,12 @@ class HDBSCAN(ClusterMixin, BaseEstimator): A distance scaling parameter as used in robust single linkage. See [3]_ for more information. 
- algorithm : {"auto", "brute", "kdtree", "balltree"}, default="auto" + algorithm : {"auto", "brute", "kd_tree", "ball_tree"}, default="auto" Exactly which algorithm to use for computing core distances; By default this is set to `"auto"` which attempts to use a :class:`~sklearn.neighbors.KDTree` tree if possible, otherwise it uses - a :class:`~sklearn.neighbors.BallTree` tree. Both `"KDTree"` and - `"BallTree"` algorithms use the + a :class:`~sklearn.neighbors.BallTree` tree. Both `"kd_tree"` and + `"ball_tree"` algorithms use the :class:`~sklearn.neighbors.NearestNeighbors` estimator. If the `X` passed during `fit` is sparse or `metric` is invalid for @@ -475,6 +475,14 @@ class HDBSCAN(ClusterMixin, BaseEstimator): :class:`~sklearn.neighbors.BallTree`, then it resolves to use the `"brute"` algorithm. + .. deprecated:: 1.4 + The `'kdtree'` option was deprecated in version 1.4, + and will be renamed to `'kd_tree'` in 1.6. + + .. deprecated:: 1.4 + The `'balltree'` option was deprecated in version 1.4, + and will be renamed to `'ball_tree'` in 1.6. + leaf_size : int, default=40 Leaf size for trees responsible for fast nearest neighbour queries when a KDTree or a BallTree are used as core-distance algorithms. A large @@ -625,15 +633,12 @@ class HDBSCAN(ClusterMixin, BaseEstimator): "metric": [StrOptions(FAST_METRICS | {"precomputed"}), callable], "metric_params": [dict, None], "alpha": [Interval(Real, left=0, right=None, closed="neither")], + # TODO(1.6): Remove "kdtree" and "balltree" option "algorithm": [ StrOptions( - { - "auto", - "brute", - "kdtree", - "balltree", - } - ) + {"auto", "brute", "kd_tree", "ball_tree", "kdtree", "balltree"}, + deprecated={"kdtree", "balltree"}, + ), ], "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], "n_jobs": [Integral, None], @@ -759,6 +764,31 @@ def fit(self, X, y=None): f"min_samples ({self._min_samples}) must be at most the number of" f" samples in X ({X.shape[0]})" ) + + # TODO(1.6): Remove + if self.algorithm == "kdtree": + warn( + ( + "`algorithm='kdtree'`has been deprecated in 1.4 and will be renamed" + " to'kd_tree'`in 1.6. To keep the past behaviour, set" + " `algorithm='kd_tree'`." + ), + FutureWarning, + ) + self.algorithm = "kd_tree" + + # TODO(1.6): Remove + if self.algorithm == "balltree": + warn( + ( + "`algorithm='balltree'`has been deprecated in 1.4 and will be" + " renamed to'ball_tree'`in 1.6. To keep the past behaviour, set" + " `algorithm='ball_tree'`." + ), + FutureWarning, + ) + self.algorithm = "ball_tree" + mst_func = None kwargs = dict( X=X, @@ -768,12 +798,14 @@ def fit(self, X, y=None): n_jobs=self.n_jobs, **self._metric_params, ) - if self.algorithm == "kdtree" and self.metric not in KDTree.valid_metrics: + if self.algorithm == "kd_tree" and self.metric not in KDTree.valid_metrics: raise ValueError( f"{self.metric} is not a valid metric for a KDTree-based algorithm." " Please select a different metric." ) - elif self.algorithm == "balltree" and self.metric not in BallTree.valid_metrics: + elif ( + self.algorithm == "ball_tree" and self.metric not in BallTree.valid_metrics + ): raise ValueError( f"{self.metric} is not a valid metric for a BallTree-based algorithm." " Please select a different metric." 
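[Not part of the patch] For downstream code, the rename above is purely a spelling change of the `algorithm` option. A minimal usage sketch, assuming a scikit-learn build that includes this deprecation (1.4 or later); the toy data is illustrative only:

from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)

# New spellings; the old "kdtree" / "balltree" values still work in 1.4 but
# emit a FutureWarning and are scheduled for removal in 1.6.
labels_kd = HDBSCAN(algorithm="kd_tree").fit_predict(X)
labels_ball = HDBSCAN(algorithm="ball_tree").fit_predict(X)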
@@ -790,11 +822,11 @@ def fit(self, X, y=None): if self.algorithm == "brute": mst_func = _hdbscan_brute kwargs["copy"] = self.copy - elif self.algorithm == "kdtree": + elif self.algorithm == "kd_tree": mst_func = _hdbscan_prims kwargs["algo"] = "kd_tree" kwargs["leaf_size"] = self.leaf_size - elif self.algorithm == "balltree": + else: mst_func = _hdbscan_prims kwargs["algo"] = "ball_tree" kwargs["leaf_size"] = self.leaf_size diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index c0c281ce31475..63087e75185dc 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -28,8 +28,8 @@ X = StandardScaler().fit_transform(X) ALGORITHMS = [ - "kdtree", - "balltree", + "kd_tree", + "ball_tree", "brute", "auto", ] @@ -149,8 +149,8 @@ def test_hdbscan_algorithms(algo, metric): return ALGOS_TREES = { - "kdtree": KDTree, - "balltree": BallTree, + "kd_tree": KDTree, + "ball_tree": BallTree, } metric_params = { "mahalanobis": {"V": np.eye(X.shape[1])}, @@ -287,22 +287,37 @@ def test_hdbscan_precomputed_non_brute(tree): def test_hdbscan_sparse(): """ Tests that HDBSCAN works correctly when passing sparse feature data. + Evaluates correctness by comparing against the same data passed as a dense + array. """ - sparse_X = sparse.csr_matrix(X) - labels = HDBSCAN().fit(sparse_X).labels_ - n_clusters = len(set(labels) - OUTLIER_SET) + dense_labels = HDBSCAN().fit(X).labels_ + n_clusters = len(set(dense_labels) - OUTLIER_SET) assert n_clusters == 3 - sparse_X_nan = sparse_X.copy() - sparse_X_nan[0, 0] = np.nan - labels = HDBSCAN().fit(sparse_X_nan).labels_ - n_clusters = len(set(labels) - OUTLIER_SET) - assert n_clusters == 3 + _X_sparse = sparse.csr_matrix(X) + X_sparse = _X_sparse.copy() + sparse_labels = HDBSCAN().fit(X_sparse).labels_ + assert_array_equal(dense_labels, sparse_labels) + + # Compare that the sparse and dense non-precomputed routines return the same labels + # where the 0th observation contains the outlier. + for outlier_val, outlier_type in ((np.inf, "infinite"), (np.nan, "missing")): + X_dense = X.copy() + X_dense[0, 0] = outlier_val + dense_labels = HDBSCAN().fit(X_dense).labels_ + n_clusters = len(set(dense_labels) - OUTLIER_SET) + assert n_clusters == 3 + assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"] + + X_sparse = _X_sparse.copy() + X_sparse[0, 0] = outlier_val + sparse_labels = HDBSCAN().fit(X_sparse).labels_ + assert_array_equal(dense_labels, sparse_labels) msg = "Sparse data matrices only support algorithm `brute`." 
with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="euclidean", algorithm="balltree").fit(sparse_X) + HDBSCAN(metric="euclidean", algorithm="ball_tree").fit(X_sparse) @pytest.mark.parametrize("algorithm", ALGORITHMS) @@ -353,7 +368,7 @@ def test_hdbscan_allow_single_cluster_with_epsilon(): cluster_selection_epsilon=0.18, cluster_selection_method="eom", allow_single_cluster=True, - algorithm="kdtree", + algorithm="kd_tree", ).fit_predict(no_structure) unique_labels, counts = np.unique(labels, return_counts=True) assert len(unique_labels) == 2 @@ -418,16 +433,16 @@ def test_hdbscan_tree_invalid_metric(): # Callables are not supported for either with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="kdtree", metric=metric_callable).fit(X) + HDBSCAN(algorithm="kd_tree", metric=metric_callable).fit(X) with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="balltree", metric=metric_callable).fit(X) + HDBSCAN(algorithm="ball_tree", metric=metric_callable).fit(X) # The set of valid metrics for KDTree at the time of writing this test is a # strict subset of those supported in BallTree metrics_not_kd = list(set(BallTree.valid_metrics) - set(KDTree.valid_metrics)) if len(metrics_not_kd) > 0: with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="kdtree", metric=metrics_not_kd[0]).fit(X) + HDBSCAN(algorithm="kd_tree", metric=metrics_not_kd[0]).fit(X) def test_hdbscan_too_many_min_samples(): @@ -531,3 +546,23 @@ def test_labelling_thresholding(): # and the largest value is exactly MAX_LAMBDA. num_noise = condensed_tree["value"] < MAX_LAMBDA assert sum(num_noise) == sum(labels == -1) + + +# TODO(1.6): Remove +def test_hdbscan_warning_on_deprecated_algorithm_name(): + # Test that warning message is shown when algorithm='kdtree' + msg = ( + "`algorithm='kdtree'`has been deprecated in 1.4 and will be renamed" + " to'kd_tree'`in 1.6. To keep the past behaviour, set `algorithm='kd_tree'`." + ) + with pytest.warns(FutureWarning, match=msg): + HDBSCAN(algorithm="kdtree").fit(X) + + # Test that warning message is shown when algorithm='balltree' + msg = ( + "`algorithm='balltree'`has been deprecated in 1.4 and will be renamed" + " to'ball_tree'`in 1.6. To keep the past behaviour, set" + " `algorithm='ball_tree'`." + ) + with pytest.warns(FutureWarning, match=msg): + HDBSCAN(algorithm="balltree").fit(X) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index f1fc90af11d82..822a13064bb08 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -238,7 +238,10 @@ def fit(self, X, Y): Y, input_name="Y", dtype=np.float64, copy=self.copy, ensure_2d=False ) if Y.ndim == 1: + self._predict_1d = True Y = Y.reshape(-1, 1) + else: + self._predict_1d = False n = X.shape[0] p = X.shape[1] @@ -469,8 +472,8 @@ def predict(self, X, copy=True): # Normalize X -= self._x_mean X /= self._x_std - Ypred = X @ self.coef_.T - return Ypred + self.intercept_ + Ypred = X @ self.coef_.T + self.intercept_ + return Ypred.ravel() if self._predict_1d else Ypred def fit_transform(self, X, y=None): """Learn and apply the dimension reduction on the train data. 
diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py index fcdd927efb389..b8b5cbaa0f275 100644 --- a/sklearn/cross_decomposition/tests/test_pls.py +++ b/sklearn/cross_decomposition/tests/test_pls.py @@ -12,7 +12,9 @@ _svd_flip_1d, ) from sklearn.datasets import load_linnerud, make_regression +from sklearn.ensemble import VotingRegressor from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import LinearRegression from sklearn.utils import check_random_state from sklearn.utils.extmath import svd_flip @@ -621,3 +623,24 @@ def test_pls_set_output(Klass): assert isinstance(y_trans, np.ndarray) assert isinstance(X_trans, pd.DataFrame) assert_array_equal(X_trans.columns, est.get_feature_names_out()) + + +def test_pls_regression_fit_1d_y(): + """Check that when fitting with 1d `y`, prediction should also be 1d. + + Non-regression test for Issue #26549. + """ + X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]]) + y = np.array([2, 6, 12, 20, 30, 42]) + expected = y.copy() + + plsr = PLSRegression().fit(X, y) + y_pred = plsr.predict(X) + assert y_pred.shape == expected.shape + + # Check that it works in VotingRegressor + lr = LinearRegression().fit(X, y) + vr = VotingRegressor([("lr", lr), ("plsr", plsr)]) + y_pred = vr.fit(X, y).predict(X) + assert y_pred.shape == expected.shape + assert_allclose(y_pred, expected) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f4a574c62c5e9..7348044e0d8fa 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -66,6 +66,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from sklearn.utils import check_random_state, compute_sample_weight from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions +from sklearn.utils._tags import _safe_tags from sklearn.utils.multiclass import check_classification_targets, type_of_target from sklearn.utils.parallel import Parallel, delayed from sklearn.utils.validation import ( @@ -163,6 +164,7 @@ def _parallel_build_trees( verbose=0, class_weight=None, n_samples_bootstrap=None, + missing_values_in_feature_mask=None, ): """ Private function used to fit a single tree in parallel.""" @@ -189,9 +191,21 @@ def _parallel_build_trees( elif class_weight == "balanced_subsample": curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices) - tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) + tree._fit( + X, + y, + sample_weight=curr_sample_weight, + check_input=False, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) else: - tree.fit(X, y, sample_weight=sample_weight, check_input=False) + tree._fit( + X, + y, + sample_weight=sample_weight, + check_input=False, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) return tree @@ -367,9 +381,26 @@ def fit(self, X, y, sample_weight=None): # Validate or convert input data if issparse(y): raise ValueError("sparse multilabel-indicator for y is not supported.") + X, y = self._validate_data( - X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + force_all_finite=False, + ) + # _compute_missing_values_in_feature_mask checks if X has missing values and + # will raise an error if the underlying tree base estimator can't handle missing + # values. 
Only the criterion is required to determine if the tree supports + # missing values. + estimator = type(self.estimator)(criterion=self.criterion) + missing_values_in_feature_mask = ( + estimator._compute_missing_values_in_feature_mask( + X, estimator_name=self.__class__.__name__ + ) ) + if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) @@ -523,6 +554,7 @@ def fit(self, X, y, sample_weight=None): verbose=self.verbose, class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, ) for i, t in enumerate(trees) ) @@ -650,7 +682,18 @@ def _validate_X_predict(self, X): """ Validate X whenever one tries to predict, apply, predict_proba.""" check_is_fitted(self) - X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False) + if self.estimators_[0]._support_missing_values(X): + force_all_finite = "allow-nan" + else: + force_all_finite = True + + X = self._validate_data( + X, + dtype=DTYPE, + accept_sparse="csr", + reset=False, + force_all_finite=force_all_finite, + ) if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): raise ValueError("No support for np.int64 index based sparse matrices") return X @@ -858,6 +901,12 @@ def get_leaf_node_samples(self, X): leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) return leaf_nodes_samples + def _more_tags(self): + # Only the criterion is required to determine if the tree supports + # missing values + estimator = type(self.estimator)(criterion=self.criterion) + return {"allow_nan": _safe_tags(estimator, key="allow_nan")} + def _accumulate_prediction(predict, X, out, lock): """ diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 3a14da52047ad..f1e7b7d6e063a 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -243,13 +243,14 @@ def _fit_stage( # no inplace multiplication! sample_weight = sample_weight * sample_mask.astype(np.float64) - X = X_csr if X_csr is not None else X + X = X_csc if X_csc is not None else X tree.fit(X, residual, sample_weight=sample_weight, check_input=False) # update tree leaves + X_for_tree_update = X_csr if X_csr is not None else X loss.update_terminal_regions( tree.tree_, - X, + X_for_tree_update, y, residual, raw_predictions, @@ -434,16 +435,18 @@ def fit(self, X, y, sample_weight=None, monitor=None): if self.n_iter_no_change is not None: stratify = y if is_classifier(self) else None - X, X_val, y, y_val, sample_weight, sample_weight_val = train_test_split( - X, - y, - sample_weight, - random_state=self.random_state, - test_size=self.validation_fraction, - stratify=stratify, + X_train, X_val, y_train, y_val, sample_weight_train, sample_weight_val = ( + train_test_split( + X, + y, + sample_weight, + random_state=self.random_state, + test_size=self.validation_fraction, + stratify=stratify, + ) ) if is_classifier(self): - if self._n_classes != np.unique(y).shape[0]: + if self._n_classes != np.unique(y_train).shape[0]: # We choose to error here. The problem is that the init # estimator would be trained on y, which has some missing # classes now, so its predictions would not have the @@ -454,6 +457,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): "seed." 
) else: + X_train, y_train, sample_weight_train = X, y, sample_weight X_val = y_val = sample_weight_val = None if not self._is_initialized(): @@ -463,19 +467,21 @@ def fit(self, X, y, sample_weight=None, monitor=None): # fit initial model and initialize raw predictions if self.init_ == "zero": raw_predictions = np.zeros( - shape=(X.shape[0], self._loss.K), dtype=np.float64 + shape=(X_train.shape[0], self._loss.K), dtype=np.float64 ) else: # XXX clean this once we have a support_sample_weight tag if sample_weight_is_none: - self.init_.fit(X, y) + self.init_.fit(X_train, y_train) else: msg = ( "The initial estimator {} does not support sample " "weights.".format(self.init_.__class__.__name__) ) try: - self.init_.fit(X, y, sample_weight=sample_weight) + self.init_.fit( + X_train, y_train, sample_weight=sample_weight_train + ) except TypeError as e: if "unexpected keyword argument 'sample_weight'" in str(e): # regular estimator without SW support @@ -493,7 +499,9 @@ def fit(self, X, y, sample_weight=None, monitor=None): else: # regular estimator whose input checking failed raise - raw_predictions = self._loss.get_init_raw_predictions(X, self.init_) + raw_predictions = self._loss.get_init_raw_predictions( + X_train, self.init_ + ) begin_at_stage = 0 @@ -513,22 +521,22 @@ def fit(self, X, y, sample_weight=None, monitor=None): # The requirements of _raw_predict # are more constrained than fit. It accepts only CSR # matrices. Finite values have already been checked in _validate_data. - X = check_array( - X, + X_train = check_array( + X_train, dtype=DTYPE, order="C", accept_sparse="csr", force_all_finite=False, ) - raw_predictions = self._raw_predict(X) + raw_predictions = self._raw_predict(X_train) self._resize_state() # fit the boosting stages n_stages = self._fit_stages( - X, - y, + X_train, + y_train, raw_predictions, - sample_weight, + sample_weight_train, self._rng, X_val, y_val, diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 5d030d3add5bb..c3af930654b73 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -59,13 +59,23 @@ def _update_leaves_values(loss, grower, y_true, raw_prediction, sample_weight): Update equals: loss.fit_intercept_only(y_true - raw_prediction) - This is only applied if loss.need_update_leaves_values is True. + This is only applied if loss.differentiable is False. Note: It only works, if the loss is a function of the residual, as is the case for AbsoluteError and PinballLoss. Otherwise, one would need to get the minimum of loss(y_true, raw_prediction + x) in x. A few examples: - AbsoluteError: median(y_true - raw_prediction). - PinballLoss: quantile(y_true - raw_prediction). - See also notes about need_update_leaves_values in BaseLoss. + + More background: + For the standard gradient descent method according to "Greedy Function + Approximation: A Gradient Boosting Machine" by Friedman, all loss functions but the + squared loss need a line search step. BaseHistGradientBoosting, however, implements + a so called Newton boosting where the trees are fitted to a 2nd order + approximations of the loss in terms of gradients and hessians. In this case, the + line search step is only necessary if the loss is not smooth, i.e. not + differentiable, which renders the 2nd order approximation invalid. 
In fact, + non-smooth losses arbitrarily set hessians to 1 and effectively use the standard + gradient descent method with line search. """ # TODO: Ideally this should be computed in parallel over the leaves using something # similar to _update_raw_predictions(), but this requires a cython version of @@ -699,7 +709,7 @@ def fit(self, X, y, sample_weight=None): acc_find_split_time += grower.total_find_split_time acc_compute_hist_time += grower.total_compute_hist_time - if self._loss.need_update_leaves_values: + if not self._loss.differentiable: _update_leaves_values( loss=self._loss, grower=grower, diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 2129e4d9a0134..539d97fbf345e 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -254,7 +254,7 @@ def fit(self, X, y, sample_weight=None): cv=deepcopy(cv), method=meth, n_jobs=self.n_jobs, - fit_params=fit_params, + params=fit_params, verbose=self.verbose, ) for est, meth in zip(all_estimators, self.stack_method_) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 9291b6982a923..efc5d7d5ee5a4 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -2031,3 +2031,94 @@ def test_multioutput_quantiles(name): assert len(leaf_nodes_samples) == len(X_test) for node_samples in leaf_nodes_samples: assert node_samples.shape[1] == est.n_outputs_ + + +@pytest.mark.parametrize( + "make_data, Forest", + [ + (datasets.make_regression, RandomForestRegressor), + (datasets.make_classification, RandomForestClassifier), + ], +) +def test_missing_values_is_resilient(make_data, Forest): + """Check that forest can deal with missing values and has decent performance.""" + + rng = np.random.RandomState(0) + n_samples, n_features = 1000, 10 + X, y = make_data(n_samples=n_samples, n_features=n_features, random_state=rng) + + # Create dataset with missing values + X_missing = X.copy() + X_missing[rng.choice([False, True], size=X.shape, p=[0.95, 0.05])] = np.nan + assert np.isnan(X_missing).any() + + X_missing_train, X_missing_test, y_train, y_test = train_test_split( + X_missing, y, random_state=0 + ) + + # Train forest with missing values + forest_with_missing = Forest(random_state=rng, n_estimators=50) + forest_with_missing.fit(X_missing_train, y_train) + score_with_missing = forest_with_missing.score(X_missing_test, y_test) + + # Train forest without missing values + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + forest = Forest(random_state=rng, n_estimators=50) + forest.fit(X_train, y_train) + score_without_missing = forest.score(X_test, y_test) + + # Score is still 80 percent of the forest's score that had no missing values + assert score_with_missing >= 0.80 * score_without_missing + + +@pytest.mark.parametrize("Forest", [RandomForestClassifier, RandomForestRegressor]) +def test_missing_value_is_predictive(Forest): + """Check that the forest learns when missing values are only present for + a predictive feature.""" + rng = np.random.RandomState(0) + n_samples = 300 + + X_non_predictive = rng.standard_normal(size=(n_samples, 10)) + y = rng.randint(0, high=2, size=n_samples) + + # Create a predictive feature using `y` and with some noise + X_random_mask = rng.choice([False, True], size=n_samples, p=[0.95, 0.05]) + y_mask = y.astype(bool) + y_mask[X_random_mask] = ~y_mask[X_random_mask] + + predictive_feature = rng.standard_normal(size=n_samples) + predictive_feature[y_mask] = np.nan + assert 
np.isnan(predictive_feature).any() + + X_predictive = X_non_predictive.copy() + X_predictive[:, 5] = predictive_feature + + ( + X_predictive_train, + X_predictive_test, + X_non_predictive_train, + X_non_predictive_test, + y_train, + y_test, + ) = train_test_split(X_predictive, X_non_predictive, y, random_state=0) + forest_predictive = Forest(random_state=0).fit(X_predictive_train, y_train) + forest_non_predictive = Forest(random_state=0).fit(X_non_predictive_train, y_train) + + predictive_test_score = forest_predictive.score(X_predictive_test, y_test) + + assert predictive_test_score >= 0.75 + assert predictive_test_score >= forest_non_predictive.score( + X_non_predictive_test, y_test + ) + + +def test_non_supported_criterion_raises_error_with_missing_values(): + """Raise error for unsupported criterion when there are missing values.""" + X = np.array([[0, 1, 2], [np.nan, 0, 2.0]]) + y = [0.5, 1.0] + + forest = RandomForestRegressor(criterion="absolute_error") + + msg = "RandomForestRegressor does not accept missing values" + with pytest.raises(ValueError, match=msg): + forest.fit(X, y) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 11cf083992653..b3dffa5494b0d 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -33,7 +33,12 @@ def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer): X_train, y_train, lambda estimator, features: _score( - estimator, X_test[:, features], y_test, scorer + # TODO(SLEP6): pass score_params here + estimator, + X_test[:, features], + y_test, + scorer, + score_params=None, ), ).scores_ diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index db0da278b39ef..e36b49f262b2d 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -282,7 +282,12 @@ def transform(self, X): Xc[:, ~valid_mask] = 0 else: Xc = X[:, valid_mask] - return Xc + + # Even if there are no missing values in X, we still concatenate Xc + # with the missing value indicator matrix, X_indicator. + # This is to ensure that the output maintains consistency in terms + # of columns, regardless of whether missing values exist in X or not. + return super()._concatenate_indicator(Xc, X_indicator) row_missing_idx = np.flatnonzero(mask.any(axis=1)) diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py index aad7eb12a0a92..be2fa6e4d1736 100644 --- a/sklearn/impute/tests/test_common.py +++ b/sklearn/impute/tests/test_common.py @@ -181,3 +181,39 @@ def test_keep_empty_features(imputer, keep_empty_features): assert X_imputed.shape == X.shape else: assert X_imputed.shape == (X.shape[0], X.shape[1] - 1) + + +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +@pytest.mark.parametrize("missing_value_test", [np.nan, 1]) +def test_imputation_adds_missing_indicator_if_add_indicator_is_true( + imputer, missing_value_test +): + """Check that missing indicator always exists when add_indicator=True. + + Non-regression test for gh-26590. + """ + X_train = np.array([[0, np.NaN], [1, 2]]) + + # Test data where missing_value_test variable can be set to np.NaN or 1. 
+ X_test = np.array([[0, missing_value_test], [1, 2]]) + + imputer.set_params(add_indicator=True) + imputer.fit(X_train) + + X_test_imputed_with_indicator = imputer.transform(X_test) + assert X_test_imputed_with_indicator.shape == (2, 3) + + imputer.set_params(add_indicator=False) + imputer.fit(X_train) + X_test_imputed_without_indicator = imputer.transform(X_test) + assert X_test_imputed_without_indicator.shape == (2, 2) + + assert_allclose( + X_test_imputed_with_indicator[:, :-1], X_test_imputed_without_indicator + ) + if np.isnan(missing_value_test): + expected_missing_indicator = [1, 0] + else: + expected_missing_indicator = [0, 0] + + assert_allclose(X_test_imputed_with_indicator[:, -1], expected_missing_indicator) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 1a9bc7216a0b5..a76fd98940ad4 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -856,8 +856,9 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): in 1.4. Use `None` instead. dual : bool, default=False - Dual or primal formulation. Dual formulation is only implemented for - l2 penalty with liblinear solver. Prefer dual=False when + Dual (constrained) or primal (regularized, see also + :ref:`this equation `) formulation. Dual formulation + is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. tol : float, default=1e-4 @@ -1474,8 +1475,9 @@ class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstima ``cv`` default value if None changed from 3-fold to 5-fold. dual : bool, default=False - Dual or primal formulation. Dual formulation is only implemented for - l2 penalty with liblinear solver. Prefer dual=False when + Dual (constrained) or primal (regularized, see also + :ref:`this equation `) formulation. Dual formulation + is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. 
penalty : {'l1', 'l2', 'elasticnet'}, default='l2' @@ -1857,10 +1859,10 @@ def fit(self, X, y, sample_weight=None, **params): if _routing_enabled(): routed_params = process_routing( - obj=self, - method="fit", + self, + "fit", sample_weight=sample_weight, - other_params=params, + **params, ) else: routed_params = Bunch() @@ -2148,10 +2150,10 @@ def score(self, X, y, sample_weight=None, **score_params): scoring = self._get_scorer() if _routing_enabled(): routed_params = process_routing( - obj=self, - method="score", + self, + "score", sample_weight=sample_weight, - other_params=score_params, + **score_params, ) else: routed_params = Bunch() diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 60b8da3ecfa46..313225088c776 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -71,26 +71,26 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef object func cdef object kwargs - cdef float64_t dist( + cdef {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil - cdef float64_t rdist( + cdef {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil - cdef float64_t dist_csr( + cdef {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -98,12 +98,12 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): const intp_t size, ) except -1 nogil - cdef float64_t rdist_csr( + cdef {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -114,39 +114,39 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef int pdist( self, const {{INPUT_DTYPE_t}}[:, ::1] X, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 cdef int cdist( self, const {{INPUT_DTYPE_t}}[:, ::1] X, const {{INPUT_DTYPE_t}}[:, ::1] Y, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 cdef int pdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const int32_t[:] x1_indptr, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, const intp_t size, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 nogil cdef int cdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const int32_t[:] x1_indptr, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, - const int32_t[:] x2_indptr, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, const intp_t size, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 nogil - cdef float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil + cdef {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil - cdef float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil + cdef {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil {{endfor}} diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp 
index bc54e51a7511a..6b5ea300f038b 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -65,6 +65,118 @@ def get_valid_metric_ids(L): if (val.__name__ in L) or (val in L)] cdef class DistanceMetric: + """Uniform interface for fast distance metric functions. + + The `DistanceMetric` class provides a convenient way to compute pairwise distances + between samples. It supports various distance metrics, such as Euclidean distance, + Manhattan distance, and more. + + The `pairwise` method can be used to compute pairwise distances between samples in + the input arrays. It returns a distance matrix representing the distances between + all pairs of samples. + + The :meth:`get_metric` method allows you to retrieve a specific metric using its + string identifier. + + Examples + -------- + >>> from sklearn.metrics import DistanceMetric + >>> dist = DistanceMetric.get_metric('euclidean') + >>> X = [[1, 2], [3, 4], [5, 6]] + >>> Y = [[7, 8], [9, 10]] + >>> dist.pairwise(X,Y) + array([[7.81..., 10.63...] + [5.65..., 8.48...] + [1.41..., 4.24...]]) + + Available Metrics + + The following lists the string metric identifiers and the associated + distance metric classes: + + **Metrics intended for real-valued vector spaces:** + + ============== ==================== ======== =============================== + identifier class name args distance function + -------------- -------------------- -------- ------------------------------- + "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` + "manhattan" ManhattanDistance - ``sum(|x - y|)`` + "chebyshev" ChebyshevDistance - ``max(|x - y|)`` + "minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` + "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` + "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` + ============== ==================== ======== =============================== + + **Metrics intended for two-dimensional vector spaces:** Note that the haversine + distance metric requires data in the form of [latitude, longitude] and both + inputs and outputs are in units of radians. + + ============ ================== =============================================================== + identifier class name distance function + ------------ ------------------ --------------------------------------------------------------- + "haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))`` + ============ ================== =============================================================== + + + **Metrics intended for integer-valued vector spaces:** Though intended + for integer-valued vectors, these are also valid metrics in the case of + real-valued vectors. + + ============= ==================== ======================================== + identifier class name distance function + ------------- -------------------- ---------------------------------------- + "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` + "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` + "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` + ============= ==================== ======================================== + + **Metrics intended for boolean-valued vector spaces:** Any nonzero entry + is evaluated to "True". 
In the listings below, the following + abbreviations are used: + + - N : number of dimensions + - NTT : number of dims in which both values are True + - NTF : number of dims in which the first value is True, second is False + - NFT : number of dims in which the first value is False, second is True + - NFF : number of dims in which both values are False + - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT + - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT + + ================= ======================= =============================== + identifier class name distance function + ----------------- ----------------------- ------------------------------- + "jaccard" JaccardDistance NNEQ / NNZ + "matching" MatchingDistance NNEQ / N + "dice" DiceDistance NNEQ / (NTT + NNZ) + "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) + "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) + "russellrao" RussellRaoDistance (N - NTT) / N + "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) + "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) + ================= ======================= =============================== + + **User-defined distance:** + + =========== =============== ======= + identifier class name args + ----------- --------------- ------- + "pyfunc" PyFuncDistance func + =========== =============== ======= + + Here ``func`` is a function which takes two one-dimensional numpy + arrays, and returns a distance. Note that in order to be used within + the BallTree, the distance must be a true metric: + i.e. it must satisfy the following properties + + 1) Non-negativity: d(x, y) >= 0 + 2) Identity: d(x, y) = 0 if and only if x == y + 3) Symmetry: d(x, y) = d(y, x) + 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) + + Because of the Python object overhead involved in calling the python + function, this will be fairly slow, but it will have the same + scaling as other distances. + """ @classmethod def get_metric(cls, metric, dtype=np.float64, **kwargs): """Get the given distance metric from the string identifier. @@ -74,11 +186,24 @@ cdef class DistanceMetric: Parameters ---------- metric : str or class name - The distance metric to use + The string identifier or class name of the desired distance metric. + See the documentation of the `DistanceMetric` class for a list of + available metrics. + dtype : {np.float32, np.float64}, default=np.float64 - The dtype of the data on which the metric will be applied + The data type of the input on which the metric will be applied. + This affects the precision of the computed distances. + By default, it is set to `np.float64`. + **kwargs - additional arguments will be passed to the requested metric + Additional keyword arguments that will be passed to the requested metric. + These arguments can be used to customize the behavior of the specific + metric. + + Returns + ------- + metric_obj : instance of the requested metric + An instance of the requested distance metric class. 
""" if dtype == np.float32: specialized_class = DistanceMetric32 @@ -332,7 +457,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): """ return - cdef float64_t dist( + cdef {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -344,7 +469,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): """ return -999 - cdef float64_t rdist( + cdef {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -364,7 +489,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef int pdist( self, const {{INPUT_DTYPE_t}}[:, ::1] X, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1: """Compute the pairwise distances between points in X""" cdef intp_t i1, i2 @@ -379,7 +504,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): self, const {{INPUT_DTYPE_t}}[:, ::1] X, const {{INPUT_DTYPE_t}}[:, ::1] Y, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1: """Compute the cross-pairwise distances between arrays X and Y""" cdef intp_t i1, i2 @@ -390,12 +515,12 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) return 0 - cdef float64_t dist_csr( + cdef {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -420,12 +545,12 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): 2. An alternative signature would be: - cdef float64_t dist_csr( + cdef {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, ) except -1 nogil: Where callers would use slicing on the original CSR data and indices @@ -456,12 +581,12 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): """ return -999 - cdef float64_t rdist_csr( + cdef {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -500,10 +625,10 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef int pdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const int32_t[:] x1_indptr, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, const intp_t size, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 nogil: """Pairwise distances between rows in CSR matrix X. 
@@ -523,9 +648,9 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): x2_end = x1_indptr[i2 + 1] D[i1, i2] = D[i2, i1] = self.dist_csr( x1_data, - x1_indices, + &x1_indices[0], x1_data, - x1_indices, + &x1_indices[0], x1_start, x1_end, x2_start, @@ -537,13 +662,13 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef int cdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const int32_t[:] x1_indptr, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, - const int32_t[:] x2_indptr, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, const intp_t size, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 nogil: """Compute the cross-pairwise distances between arrays X and Y represented in the CSR format.""" @@ -562,9 +687,9 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): D[i1, i2] = self.dist_csr( x1_data, - x1_indices, + &x1_indices[0], x2_data, - x2_indices, + &x2_indices[0], x1_start, x1_end, x2_start, @@ -573,11 +698,11 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): ) return 0 - cdef float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: """Convert the rank-preserving surrogate distance to the distance""" return rdist - cdef float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: """Convert the distance to the rank-preserving surrogate distance""" return dist @@ -624,33 +749,33 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): def _pairwise_dense_dense(self, X, Y): cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr cdef const {{INPUT_DTYPE_t}}[:, ::1] Yarr - cdef float64_t[:, ::1] Darr + cdef {{INPUT_DTYPE_t}}[:, ::1] Darr Xarr = np.asarray(X, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Xarr) if X is Y: - Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), dtype=np.float64, order='C') + Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), dtype={{INPUT_DTYPE}}, order='C') self.pdist(Xarr, Darr) else: Yarr = np.asarray(Y, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Yarr) - Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype=np.float64, order='C') + Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype={{INPUT_DTYPE}}, order='C') self.cdist(Xarr, Yarr, Darr) return np.asarray(Darr) def _pairwise_sparse_sparse(self, X: csr_matrix , Y: csr_matrix): cdef: intp_t n_X, n_features - const {{INPUT_DTYPE_t}}[:] X_data - const int32_t[:] X_indices - const int32_t[:] X_indptr + const {{INPUT_DTYPE_t}}[::1] X_data + const int32_t[::1] X_indices + const int32_t[::1] X_indptr intp_t n_Y - const {{INPUT_DTYPE_t}}[:] Y_data - const int32_t[:] Y_indices - const int32_t[:] Y_indptr + const {{INPUT_DTYPE_t}}[::1] Y_data + const int32_t[::1] Y_indices + const int32_t[::1] Y_indptr - float64_t[:, ::1] Darr + {{INPUT_DTYPE_t}}[:, ::1] Darr X_csr = X.tocsr() n_X, n_features = X_csr.shape @@ -658,7 +783,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): X_indices = np.asarray(X_csr.indices, dtype=np.int32) X_indptr = np.asarray(X_csr.indptr, dtype=np.int32) if X is Y: - Darr = np.empty((n_X, n_X), dtype=np.float64, order='C') + Darr = np.empty((n_X, n_X), dtype={{INPUT_DTYPE}}, order='C') self.pdist_csr( x1_data=&X_data[0], x1_indices=X_indices, @@ -673,7 +798,7 @@ cdef class 
DistanceMetric{{name_suffix}}(DistanceMetric): Y_indices = np.asarray(Y_csr.indices, dtype=np.int32) Y_indptr = np.asarray(Y_csr.indptr, dtype=np.int32) - Darr = np.empty((n_X, n_Y), dtype=np.float64, order='C') + Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') self.cdist_csr( x1_data=&X_data[0], x1_indices=X_indices, @@ -690,13 +815,13 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef: intp_t n_X = X.shape[0] intp_t n_features = X.shape[1] - const {{INPUT_DTYPE_t}}[:] X_data = np.asarray( + const {{INPUT_DTYPE_t}}[::1] X_data = np.asarray( X.data, dtype={{INPUT_DTYPE}}, ) - const int32_t[:] X_indices = np.asarray( + const int32_t[::1] X_indices = np.asarray( X.indices, dtype=np.int32, ) - const int32_t[:] X_indptr = np.asarray( + const int32_t[::1] X_indptr = np.asarray( X.indptr, dtype=np.int32, ) @@ -704,11 +829,11 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): Y, dtype={{INPUT_DTYPE}}, order="C", ) intp_t n_Y = Y_data.shape[0] - const int32_t[:] Y_indices = ( + const int32_t[::1] Y_indices = ( np.arange(n_features, dtype=np.int32) ) - float64_t[:, ::1] Darr = np.empty((n_X, n_Y), dtype=np.float64, order='C') + {{INPUT_DTYPE_t}}[:, ::1] Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') intp_t i1, i2 intp_t x1_start, x1_end @@ -735,9 +860,9 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): Darr[i1, i2] = self.dist_csr( x1_data=&X_data[0], - x1_indices=X_indices, + x1_indices=&X_indices[0], x2_data=x2_data, - x2_indices=Y_indices, + x2_indices=&Y_indices[0], x1_start=x1_start, x1_end=x1_end, x2_start=0, @@ -758,22 +883,22 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): const {{INPUT_DTYPE_t}}[:, ::1] X_data = np.asarray( X, dtype={{INPUT_DTYPE}}, order="C", ) - const int32_t[:] X_indices = np.arange( + const int32_t[::1] X_indices = np.arange( n_features, dtype=np.int32, ) intp_t n_Y = Y.shape[0] - const {{INPUT_DTYPE_t}}[:] Y_data = np.asarray( + const {{INPUT_DTYPE_t}}[::1] Y_data = np.asarray( Y.data, dtype={{INPUT_DTYPE}}, ) - const int32_t[:] Y_indices = np.asarray( + const int32_t[::1] Y_indices = np.asarray( Y.indices, dtype=np.int32, ) - const int32_t[:] Y_indptr = np.asarray( + const int32_t[::1] Y_indptr = np.asarray( Y.indptr, dtype=np.int32, ) - float64_t[:, ::1] Darr = np.empty((n_X, n_Y), dtype=np.float64, order='C') + {{INPUT_DTYPE_t}}[:, ::1] Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') intp_t i1, i2 {{INPUT_DTYPE_t}} * x1_data @@ -801,9 +926,9 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): Darr[i1, i2] = self.dist_csr( x1_data=x1_data, - x1_indices=X_indices, + x1_indices=&X_indices[0], x2_data=&Y_data[0], - x2_indices=Y_indices, + x2_indices=&Y_indices[0], x1_start=0, x1_end=n_features, x2_start=x2_start, @@ -867,24 +992,24 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = 2 - cdef inline float64_t dist(self, + cdef inline {{INPUT_DTYPE_t}} dist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: return euclidean_dist{{name_suffix}}(x1, x2, size) - cdef inline float64_t rdist(self, + cdef inline {{INPUT_DTYPE_t}} rdist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: return euclidean_rdist{{name_suffix}}(x1, x2, size) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return sqrt(rdist) 
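A minimal sketch, assuming a build of this branch, of the user-visible effect of swapping the `float64_t` return and buffer types for `{{INPUT_DTYPE_t}}`: requesting the float32 specialization via `get_metric(..., dtype=np.float32)` (documented earlier in this diff) should now yield a float32 distance matrix instead of the float64 one produced before these template changes. The output dtype is an inference from the templates, not a documented guarantee.

```python
# Sketch: float32 inputs are expected to produce a float32 distance matrix on this branch.
import numpy as np
from sklearn.metrics import DistanceMetric

X = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32)
dist32 = DistanceMetric.get_metric("euclidean", dtype=np.float32)
D = dist32.pairwise(X)
print(D.dtype)  # expected: float32 with this patch applied (float64 upstream before it)
```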
- cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: return dist * dist def rdist_to_dist(self, rdist): @@ -893,12 +1018,12 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -945,12 +1070,12 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -988,7 +1113,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('SEuclidean dist: size of V does not match') - cdef inline float64_t rdist( + cdef inline {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1001,7 +1126,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (tmp * tmp / self.vec[j]) return d - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1009,10 +1134,10 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ) except -1 nogil: return sqrt(self.rdist(x1, x2, size)) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return sqrt(rdist) - cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: return dist * dist def rdist_to_dist(self, rdist): @@ -1021,12 +1146,12 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1074,12 +1199,12 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1111,7 +1236,7 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = 1 - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const 
{{INPUT_DTYPE_t}}* x2, @@ -1123,12 +1248,12 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += fabs(x1[j] - x2[j]) return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1141,7 +1266,7 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): intp_t i1 = x1_start intp_t i2 = x2_start - float64_t d = 0.0 + {{INPUT_DTYPE_t}} d = 0.0 while i1 < x1_end and i2 < x2_end: ix1 = x1_indices[i1] @@ -1194,7 +1319,7 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = INF{{name_suffix}} - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1207,12 +1332,12 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1271,19 +1396,27 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): Parameters ---------- - p : int + p : float The order of the p-norm of the difference (see above). + + .. versionchanged:: 1.4.0 + Minkowski distance allows `p` to be `0= 1 and finite. For p = infinity, - use ChebyshevDistance. + Minkowski Distance requires p > 0 and finite. + When :math:`p \in (0,1)`, it isn't a true metric but is permissible when + the triangular inequality isn't necessary. + For p = infinity, use ChebyshevDistance. Note that for p=1, ManhattanDistance is more efficient, and for p=2, EuclideanDistance is more efficient. + """ def __init__(self, p, w=None): - if p < 1: - raise ValueError("p must be greater than 1") + if p <= 0: + raise ValueError("p must be greater than 0") elif np.isinf(p): raise ValueError("MinkowskiDistance requires finite p. " "For p=inf, use ChebyshevDistance.") @@ -1307,7 +1440,7 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): f"the number of features ({X.shape[1]}). " f"Currently len(w)={self.size}.") - cdef inline float64_t rdist( + cdef inline {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1324,7 +1457,7 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (pow(fabs(x1[j] - x2[j]), self.p)) return d - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1332,10 +1465,10 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ) except -1 nogil: return pow(self.rdist(x1, x2, size), 1. / self.p) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return pow(rdist, 1. 
/ self.p) - cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: return pow(dist, self.p) def rdist_to_dist(self, rdist): @@ -1344,12 +1477,12 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** self.p - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1424,12 +1557,12 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1496,7 +1629,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('Mahalanobis dist: size of V does not match') - cdef inline float64_t rdist( + cdef inline {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1516,7 +1649,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += tmp * self.buffer[i] return d - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1524,10 +1657,10 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ) except -1 nogil: return sqrt(self.rdist(x1, x2, size)) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return sqrt(rdist) - cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: return dist * dist def rdist_to_dist(self, rdist): @@ -1536,12 +1669,12 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1590,12 +1723,12 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1627,7 +1760,7 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. 
math:: D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1641,12 +1774,12 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return float(n_unequal) / size - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1702,7 +1835,7 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. math:: D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1716,12 +1849,12 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += fabs(x1[j] - x2[j]) / denom return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1777,7 +1910,7 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. math:: D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1793,12 +1926,12 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): else: return 0.0 - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1857,7 +1990,7 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1877,12 +2010,12 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return 0 return (nnz - n_eq) * 1.0 / nnz - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1946,7 +2079,7 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / N """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1960,12 +2093,12 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return n_neq * 1. 
/ size - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2021,7 +2154,7 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (2 * N_TT + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2036,12 +2169,12 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return n_neq / (2.0 * n_tt + n_neq) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2102,7 +2235,7 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 1 - N_TT / (N + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2117,12 +2250,12 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return (n_neq - n_tt + size) * 1.0 / (n_neq + size) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2181,7 +2314,7 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2195,12 +2328,12 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2258,7 +2391,7 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N - N_TT) / N """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2272,12 +2405,12 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return (size - n_tt) * 1. 
/ size - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2328,7 +2461,7 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2342,12 +2475,12 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2405,7 +2538,7 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT / 2 + N_FT + N_TF) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2420,12 +2553,12 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return n_neq / (0.5 * n_tt + n_neq) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2494,7 +2627,7 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): raise ValueError("Haversine distance only valid " "in 2 dimensions") - cdef inline float64_t rdist(self, + cdef inline {{INPUT_DTYPE_t}} rdist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, @@ -2503,17 +2636,17 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef float64_t sin_1 = sin(0.5 * ((x1[1]) - (x2[1]))) return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) - cdef inline float64_t dist(self, + cdef inline {{INPUT_DTYPE_t}} dist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: return 2 * asin(sqrt(self.rdist(x1, x2, size))) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return 2 * asin(sqrt(rdist)) - cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: cdef float64_t tmp = sin(0.5 * dist) return tmp * tmp @@ -2524,17 +2657,17 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): tmp = np.sin(0.5 * dist) return tmp * tmp - cdef inline float64_t dist_csr( - self, - const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, - const int32_t x1_start, - const int32_t x1_end, - const int32_t x2_start, - const int32_t x2_end, - const intp_t 
size, + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, ) except -1 nogil: return 2 * asin(sqrt(self.rdist_csr( x1_data, @@ -2548,12 +2681,12 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): size, ))) - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2640,7 +2773,7 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The # only way to be back compatible is to inherit `dist` from the base class # without GIL and called an inline `_dist` which acquire GIL. - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2648,7 +2781,7 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ) except -1 nogil: return self._dist(x1, x2, size) - cdef inline float64_t _dist( + cdef inline {{INPUT_DTYPE_t}} _dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 7edc64c59a050..dd66299223efe 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -36,7 +36,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): X, Y, intp_t k, - str metric="euclidean", + metric="euclidean", chunk_size=None, dict metric_kwargs=None, str strategy=None, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp index 3d0ea84b0091d..f9719f6959dfc 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp @@ -8,13 +8,7 @@ from ...utils._typedefs cimport intp_t, float64_t import numpy as np from scipy.sparse import issparse from sklearn.utils.fixes import threadpool_limits - -cpdef enum WeightingStrategy: - uniform = 0 - # TODO: Implement the following options, most likely in - # `weighted_histogram_mode` - distance = 1 - callable = 2 +from ._classmode cimport WeightingStrategy {{for name_suffix in ["32", "64"]}} from ._argkmin cimport ArgKmin{{name_suffix}} @@ -25,8 +19,8 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): {{name_suffix}}bit implementation of ArgKminClassMode. 
""" cdef: - const intp_t[:] class_membership, - const intp_t[:] unique_labels + const intp_t[:] Y_labels, + const intp_t[:] unique_Y_labels float64_t[:, :] class_scores cpp_map[intp_t, intp_t] labels_to_index WeightingStrategy weight_type @@ -38,14 +32,14 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): Y, intp_t k, weights, - class_membership, - unique_labels, + Y_labels, + unique_Y_labels, str metric="euclidean", chunk_size=None, dict metric_kwargs=None, str strategy=None, ): - """Compute the argkmin reduction with class_membership. + """Compute the argkmin reduction with Y_labels. This classmethod is responsible for introspecting the arguments values to dispatch to the most appropriate implementation of @@ -66,8 +60,8 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): chunk_size=chunk_size, strategy=strategy, weights=weights, - class_membership=class_membership, - unique_labels=unique_labels, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, ) # Limit the number of threads in second level of nested parallelism for BLAS @@ -83,8 +77,8 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): def __init__( self, DatasetsPair{{name_suffix}} datasets_pair, - const intp_t[:] class_membership, - const intp_t[:] unique_labels, + const intp_t[:] Y_labels, + const intp_t[:] unique_Y_labels, chunk_size=None, strategy=None, intp_t k=1, @@ -103,15 +97,15 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): self.weight_type = WeightingStrategy.distance else: self.weight_type = WeightingStrategy.callable - self.class_membership = class_membership + self.Y_labels = Y_labels - self.unique_labels = unique_labels + self.unique_Y_labels = unique_Y_labels cdef intp_t idx, neighbor_class_idx # Map from set of unique labels to their indices in `class_scores` # Buffer used in building a histogram for one-pass weighted mode self.class_scores = np.zeros( - (self.n_samples_X, unique_labels.shape[0]), dtype=np.float64, + (self.n_samples_X, unique_Y_labels.shape[0]), dtype=np.float64, ) def _finalize_results(self): @@ -142,7 +136,7 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): if use_distance_weighting: score_incr = 1 / distances[neighbor_rank] neighbor_idx = indices[neighbor_rank] - neighbor_class_idx = self.class_membership[neighbor_idx] + neighbor_class_idx = self.Y_labels[neighbor_idx] self.class_scores[sample_index][neighbor_class_idx] += score_incr return diff --git a/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd b/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd new file mode 100644 index 0000000000000..65db044d668e8 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd @@ -0,0 +1,5 @@ +cpdef enum WeightingStrategy: + uniform = 0 + # TODO: Implement the following options in weighted_histogram_mode + distance = 1 + callable = 2 diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp index fc56a59cab16f..1e57b3291a8f4 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -38,22 +38,22 @@ cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef: const {{INPUT_DTYPE_t}}[:] X_data - const int32_t[:] X_indices - const int32_t[:] X_indptr + const 
int32_t[::1] X_indices + const int32_t[::1] X_indptr const {{INPUT_DTYPE_t}}[:] Y_data - const int32_t[:] Y_indices - const int32_t[:] Y_indptr + const int32_t[::1] Y_indices + const int32_t[::1] Y_indptr cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef: const {{INPUT_DTYPE_t}}[:] X_data - const int32_t[:] X_indices - const int32_t[:] X_indptr + const int32_t[::1] X_indices + const int32_t[::1] X_indptr const {{INPUT_DTYPE_t}}[:] Y_data - const int32_t[:] Y_indices + const int32_t[::1] Y_indices intp_t n_Y diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 40a9a45e8b8e1..2c3ca44047145 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -1,3 +1,5 @@ +import copy + {{py: implementation_specific_values = [ @@ -53,7 +55,7 @@ cdef class DatasetsPair{{name_suffix}}: cls, X, Y, - str metric="euclidean", + metric="euclidean", dict metric_kwargs=None, ) -> DatasetsPair{{name_suffix}}: """Return the DatasetsPair implementation for the given arguments. @@ -70,7 +72,7 @@ cdef class DatasetsPair{{name_suffix}}: If provided as a ndarray, it must be C-contiguous. If provided as a sparse matrix, it must be in CSR format. - metric : str, default='euclidean' + metric : str or DistanceMetric object, default='euclidean' The distance metric to compute between rows of X and Y. The default metric is a fast implementation of the Euclidean metric. For a list of available metrics, see the documentation @@ -84,12 +86,17 @@ cdef class DatasetsPair{{name_suffix}}: datasets_pair: DatasetsPair{{name_suffix}} The suited DatasetsPair{{name_suffix}} implementation. """ - # Y_norm_squared might be propagated down to DatasetsPairs - # via metrics_kwargs when the Euclidean specialisations - # can't be used. To prevent Y_norm_squared to be passed + # X_norm_squared and Y_norm_squared might be propagated + # down to DatasetsPairs via metrics_kwargs when the Euclidean + # specialisations can't be used. + # To prevent X_norm_squared and Y_norm_squared to be passed # down to DistanceMetrics (whose constructors would raise - # a RuntimeError), we pop it here. + # a RuntimeError), we pop them here. 
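Hedged sketch of the widened ``metric`` parameter ("str or DistanceMetric object") documented just above: the dispatcher's usability check, relaxed in a hunk a little further below, should accept a pre-built `DistanceMetric` instance alongside the usual string identifiers. The private import path and the returned booleans are assumptions about this branch, not public API guarantees.

```python
# Sketch: the usability check should pass for both a string id and a DistanceMetric instance.
import numpy as np
from sklearn.metrics import DistanceMetric
from sklearn.metrics._pairwise_distances_reduction import ArgKmin

X = np.random.RandomState(0).randn(20, 3)
Y = np.random.RandomState(1).randn(30, 3)

print(ArgKmin.is_usable_for(X, Y, metric="manhattan"))                              # string id
print(ArgKmin.is_usable_for(X, Y, metric=DistanceMetric.get_metric("manhattan")))   # instance
```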
if metric_kwargs is not None: + # Copying metric_kwargs not to pop "X_norm_squared" + # and "Y_norm_squared" where they are used + metric_kwargs = copy.copy(metric_kwargs) + metric_kwargs.pop("X_norm_squared", None) metric_kwargs.pop("Y_norm_squared", None) cdef: {{DistanceMetric}} distance_metric = DistanceMetric.get_metric( @@ -231,9 +238,9 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: return self.distance_metric.rdist_csr( x1_data=&self.X_data[0], - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], x2_data=&self.Y_data[0], - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=self.Y_indptr[j], @@ -245,9 +252,9 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: return self.distance_metric.dist_csr( x1_data=&self.X_data[0], - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], x2_data=&self.Y_data[0], - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=self.Y_indptr[j], @@ -324,11 +331,11 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: return self.distance_metric.rdist_csr( x1_data=&self.X_data[0], - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], # Increment the data pointer such that x2_start=0 is aligned with the # j-th row x2_data=&self.Y_data[0] + j * self.n_features, - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=0, @@ -341,11 +348,11 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): return self.distance_metric.dist_csr( x1_data=&self.X_data[0], - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], # Increment the data pointer such that x2_start=0 is aligned with the # j-th row x2_data=&self.Y_data[0] + j * self.n_features, - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=0, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 796f15ab6fca0..e23da467d723a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -5,7 +5,11 @@ from scipy.sparse import issparse from ... import get_config -from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING64 +from .._dist_metrics import ( + BOOL_METRICS, + METRIC_MAPPING64, + DistanceMetric, +) from ._argkmin import ( ArgKmin32, ArgKmin64, @@ -117,7 +121,7 @@ def is_valid_sparse_matrix(X): and (is_numpy_c_ordered(Y) or is_valid_sparse_matrix(Y)) and X.dtype == Y.dtype and X.dtype in (np.float32, np.float64) - and metric in cls.valid_metrics() + and (metric in cls.valid_metrics() or isinstance(metric, DistanceMetric)) ) return is_usable @@ -456,7 +460,7 @@ def is_usable_for(cls, X, Y, metric) -> bool: The input array to be labelled. Y : ndarray of shape (n_samples_Y, n_features) - The input array whose labels are provided through the `labels` + The input array whose labels are provided through the `Y_labels` parameter. 
metric : str, default='euclidean' @@ -484,8 +488,8 @@ def compute( Y, k, weights, - labels, - unique_labels, + Y_labels, + unique_Y_labels, metric="euclidean", chunk_size=None, metric_kwargs=None, @@ -499,23 +503,23 @@ def compute( The input array to be labelled. Y : ndarray of shape (n_samples_Y, n_features) - The input array whose labels are provided through the `labels` - parameter. + The input array whose class membership are provided through the + `Y_labels` parameter. k : int The number of nearest neighbors to consider. weights : ndarray - The weights applied over the `labels` of `Y` when computing the + The weights applied over the `Y_labels` of `Y` when computing the weighted mode of the labels. - class_membership : ndarray + Y_labels : ndarray An array containing the index of the class membership of the associated samples in `Y`. This is used in labeling `X`. - unique_classes : ndarray + unique_Y_labels : ndarray An array containing all unique indices contained in the - corresponding `class_membership` array. + corresponding `Y_labels` array. metric : str, default='euclidean' The distance metric to use. For a list of available metrics, see @@ -587,8 +591,8 @@ def compute( Y=Y, k=k, weights=weights, - class_membership=np.array(labels, dtype=np.intp), - unique_labels=np.array(unique_labels, dtype=np.intp), + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), metric=metric, chunk_size=chunk_size, metric_kwargs=metric_kwargs, @@ -601,8 +605,8 @@ def compute( Y=Y, k=k, weights=weights, - class_membership=np.array(labels, dtype=np.intp), - unique_labels=np.array(unique_labels, dtype=np.intp), + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), metric=metric, chunk_size=chunk_size, metric_kwargs=metric_kwargs, diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index aee1615c55630..302831366aa54 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -124,7 +124,7 @@ def __call__(self, estimator, *args, **kwargs): cached_call = partial(_cached_call, cache) if _routing_enabled(): - routed_params = process_routing(self, "score", kwargs) + routed_params = process_routing(self, "score", **kwargs) else: # they all get the same args, and they all get them all routed_params = Bunch( @@ -293,6 +293,13 @@ def set_score_request(self, **kwargs): Arguments should be of the form ``param_name=alias``, and `alias` can be one of ``{True, False, None, str}``. """ + if not _routing_enabled(): + raise RuntimeError( + "This method is only available when metadata routing is enabled." + " You can enable it using" + " sklearn.set_config(enable_metadata_routing=True)." 
+ ) + self._warn_overlap( message=( "You are setting metadata request for parameters which are " diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index a05a532ecb3f2..cfcb08a312443 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -159,10 +159,10 @@ def test_classification_report_dictionary_output(): for metric in expected_report[key]: assert_almost_equal(expected_report[key][metric], report[key][metric]) - assert type(expected_report["setosa"]["precision"]) == float - assert type(expected_report["macro avg"]["precision"]) == float - assert type(expected_report["setosa"]["support"]) == int - assert type(expected_report["macro avg"]["support"]) == int + assert isinstance(expected_report["setosa"]["precision"], float) + assert isinstance(expected_report["macro avg"]["precision"], float) + assert isinstance(expected_report["setosa"]["support"], int) + assert isinstance(expected_report["macro avg"]["support"], int) def test_classification_report_output_dict_empty_input(): diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 16aa5c569b161..7d44b988b9161 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -15,6 +15,7 @@ ) from sklearn.utils import check_random_state from sklearn.utils._testing import assert_allclose, create_memmap_backed_data +from sklearn.utils.fixes import parse_version, sp_version def dist_func(x1, x2, p): @@ -42,18 +43,17 @@ def dist_func(x1, x2, p): V = rng.random_sample((d, d)) VI = np.dot(V, V.T) - METRICS_DEFAULT_PARAMS = [ ("euclidean", {}), ("cityblock", {}), - ("minkowski", dict(p=(1, 1.5, 2, 3))), + ("minkowski", dict(p=(0.5, 1, 1.5, 2, 3))), ("chebyshev", {}), ("seuclidean", dict(V=(rng.random_sample(d),))), ("mahalanobis", dict(VI=(VI,))), ("hamming", {}), ("canberra", {}), ("braycurtis", {}), - ("minkowski", dict(p=(1, 1.5, 3), w=(rng.random_sample(d),))), + ("minkowski", dict(p=(0.5, 1, 1.5, 3), w=(rng.random_sample(d),))), ] @@ -76,6 +76,13 @@ def test_cdist(metric_param_grid, X, Y): # with scipy rtol_dict = {"rtol": 1e-6} + # TODO: Remove when scipy minimum version >= 1.7.0 + # scipy supports 0= 1.7.0 + if metric == "minkowski": + p = kwargs["p"] + if sp_version < parse_version("1.7.0") and p < 1: + pytest.skip("scipy does not support 0= 1.7.0 + # scipy supports 0= 1.7.0 + if metric == "minkowski": + p = kwargs["p"] + if sp_version < parse_version("1.7.0") and p < 1: + pytest.skip("scipy does not support 0>> print(scores['train_r2']) [0.28009951 0.3908844 0.22784907] """ - X, y, groups = indexable(X, y, groups) + params = _check_params_groups_deprecation(fit_params, params, groups) + + X, y = indexable(X, y) cv = check_cv(cv, y, classifier=is_classifier(estimator)) @@ -298,7 +359,62 @@ def cross_validate( else: scorers = _check_multimetric_scoring(estimator, scoring) - indices = cv.split(X, y, groups) + if _routing_enabled(): + # `cross_validate` will create a `_MultiMetricScorer` if `scoring` is a + # dict at a later stage. We need the same object for the purpose of + # routing. However, creating it here and passing it around would create + # a much larger diff since the dict is used in many places. + if isinstance(scorers, dict): + _scorer = _MultimetricScorer( + scorers=scorers, raise_exc=(error_score == "raise") + ) + else: + _scorer = scorers + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. 
For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_validate") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + scorer=_scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + raise UnsetMetadataPassedError( + message=( + f"{sorted(e.unrequested_params.keys())} are passed to cross" + " validation but are not explicitly requested or unrequested. See" + " the Metadata Routing User guide" + " for more" + " information." + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) + routed_params.scorer = Bunch(score={}) + + indices = cv.split(X, y, **routed_params.splitter.split) if return_indices: # materialize the indices since we need to store them in the returned dict indices = list(indices) @@ -311,12 +427,13 @@ def cross_validate( clone(estimator), X, y, - scorers, - train, - test, - verbose, - None, - fit_params, + scorer=scorers, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, return_train_score=return_train_score, return_times=True, return_estimator=return_estimator, @@ -436,6 +553,7 @@ def _warn_or_raise_about_fit_failures(results, error_score): "n_jobs": [Integral, None], "verbose": ["verbose"], "fit_params": [dict, None], + "params": [dict, None], "pre_dispatch": [Integral, str, None], "error_score": [StrOptions({"raise"}), Real], }, @@ -452,6 +570,7 @@ def cross_val_score( n_jobs=None, verbose=0, fit_params=None, + params=None, pre_dispatch="2*n_jobs", error_score=np.nan, ): @@ -477,6 +596,13 @@ def cross_val_score( train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`). + .. versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_val_score(..., params={'groups': groups})``. + scoring : str or callable, default=None A str (see model evaluation documentation) or a scorer callable object / function with signature @@ -521,6 +647,16 @@ def cross_val_score( fit_params : dict, default=None Parameters to pass to the fit method of the estimator. + .. deprecated:: 1.4 + This parameter is deprecated and will be removed in version 1.6. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit``, the scorer, + and the CV splitter. + + .. versionadded:: 1.4 + pre_dispatch : int or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. 
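A small sketch of the ``fit_params`` to ``params`` migration described in these docstrings. ``sample_weight`` here is only an illustrative fit kwarg; with metadata routing left disabled, ``params`` is assumed to reach the estimator's ``fit`` through the fallback branch shown above, just as ``fit_params`` did.

```python
# Sketch: passing estimator fit kwargs through ``params`` instead of ``fit_params``.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

rng = np.random.RandomState(0)
X, y = rng.randn(60, 4), rng.randint(0, 2, 60)
w = rng.rand(60)

res = cross_validate(LogisticRegression(), X, y, cv=3, params={"sample_weight": w})
print(res["test_score"])
```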
Reducing this number can be useful to avoid an @@ -585,6 +721,7 @@ def cross_val_score( n_jobs=n_jobs, verbose=verbose, fit_params=fit_params, + params=params, pre_dispatch=pre_dispatch, error_score=error_score, ) @@ -595,12 +732,14 @@ def _fit_and_score( estimator, X, y, + *, scorer, train, test, verbose, parameters, fit_params, + score_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, @@ -654,6 +793,9 @@ def _fit_and_score( fit_params : dict or None Parameters that will be passed to ``estimator.fit``. + score_params : dict or None + Parameters that will be passed to the scorer. + return_train_score : bool, default=False Compute and return score on training set. @@ -724,6 +866,9 @@ def _fit_and_score( # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = _check_method_params(X, params=fit_params, indices=train) + score_params = score_params if score_params is not None else {} + score_params_train = _check_method_params(X, params=score_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) if parameters is not None: # here we clone the parameters, since sometimes the parameters @@ -764,10 +909,14 @@ def _fit_and_score( result["fit_error"] = None fit_time = time.time() - start_time - test_scores = _score(estimator, X_test, y_test, scorer, error_score) + test_scores = _score( + estimator, X_test, y_test, scorer, score_params_test, error_score + ) score_time = time.time() - start_time - fit_time if return_train_score: - train_scores = _score(estimator, X_train, y_train, scorer, error_score) + train_scores = _score( + estimator, X_train, y_train, scorer, score_params_train, error_score + ) if verbose > 1: total_time = score_time + fit_time @@ -809,7 +958,7 @@ def _fit_and_score( return result -def _score(estimator, X_test, y_test, scorer, error_score="raise"): +def _score(estimator, X_test, y_test, scorer, score_params, error_score="raise"): """Compute the score(s) of an estimator on a given test set. Will return a dict of floats if `scorer` is a dict, otherwise a single @@ -819,11 +968,13 @@ def _score(estimator, X_test, y_test, scorer, error_score="raise"): # will cache method calls if needed. scorer() returns a dict scorer = _MultimetricScorer(scorers=scorer, raise_exc=(error_score == "raise")) + score_params = {} if score_params is None else score_params + try: if y_test is None: - scores = scorer(estimator, X_test) + scores = scorer(estimator, X_test, **score_params) else: - scores = scorer(estimator, X_test, y_test) + scores = scorer(estimator, X_test, y_test, **score_params) except Exception: if isinstance(scorer, _MultimetricScorer): # If `_MultimetricScorer` raises exception, the `error_score` @@ -891,6 +1042,7 @@ def _score(estimator, X_test, y_test, scorer, error_score="raise"): "n_jobs": [Integral, None], "verbose": ["verbose"], "fit_params": [dict, None], + "params": [dict, None], "pre_dispatch": [Integral, str, None], "method": [ StrOptions( @@ -915,6 +1067,7 @@ def cross_val_predict( n_jobs=None, verbose=0, fit_params=None, + params=None, pre_dispatch="2*n_jobs", method="predict", ): @@ -950,6 +1103,13 @@ def cross_val_predict( train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`). + .. versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. 
When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_val_predict(..., params={'groups': groups})``. + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -983,6 +1143,16 @@ def cross_val_predict( fit_params : dict, default=None Parameters to pass to the fit method of the estimator. + .. deprecated:: 1.4 + This parameter is deprecated and will be removed in version 1.6. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit`` and the CV + splitter. + + .. versionadded:: 1.4 + pre_dispatch : int or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an @@ -1042,10 +1212,50 @@ def cross_val_predict( >>> lasso = linear_model.Lasso() >>> y_pred = cross_val_predict(lasso, X, y, cv=3) """ - X, y, groups = indexable(X, y, groups) + params = _check_params_groups_deprecation(fit_params, params, groups) + X, y = indexable(X, y) + + if _routing_enabled(): + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_validate") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata for the predict method. + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + raise UnsetMetadataPassedError( + message=( + f"{sorted(e.unrequested_params.keys())} are passed to cross" + " validation but are not explicitly requested or unrequested. See" + " the Metadata Routing User guide" + " for more" + " information." + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) cv = check_cv(cv, y, classifier=is_classifier(estimator)) - splits = list(cv.split(X, y, groups)) + splits = list(cv.split(X, y, **routed_params.splitter.split)) test_indices = np.concatenate([test for _, test in splits]) if not _check_is_permutation(test_indices, _num_samples(X)): @@ -1073,7 +1283,13 @@ def cross_val_predict( parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) predictions = parallel( delayed(_fit_and_predict)( - clone(estimator), X, y, train, test, verbose, fit_params, method + clone(estimator), + X, + y, + train, + test, + routed_params.estimator.fit, + method, ) for train, test in splits ) @@ -1103,7 +1319,7 @@ def cross_val_predict( return predictions[inv_test_indices] -def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method): +def _fit_and_predict(estimator, X, y, train, test, fit_params, method): """Fit estimator and predict values for a given dataset split. Read more in the :ref:`User Guide `. 
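The docstring additions above describe the user-facing migration: `fit_params` is deprecated in favour of `params`, and once `sklearn.set_config(enable_metadata_routing=True)` is active, `groups` has to travel inside `params` as well. A short usage sketch consistent with those notes; the estimator and data are illustrative, and group splitters are assumed to request `groups` by default:

    import numpy as np
    import sklearn
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GroupKFold, cross_val_score

    rng = np.random.RandomState(0)
    X = rng.random_sample((12, 3))
    y = np.array([0, 1] * 6)
    groups = np.repeat([0, 1, 2], 4)
    sample_weight = np.ones(12)

    # Legacy call (routing disabled): groups is its own argument, and fit
    # parameters would go through the now-deprecated ``fit_params``.
    cross_val_score(LogisticRegression(), X, y, groups=groups, cv=GroupKFold(n_splits=3))

    # With routing enabled, all metadata travels inside ``params`` and each
    # consumer must request what it uses.
    sklearn.set_config(enable_metadata_routing=True)
    est = LogisticRegression().set_fit_request(sample_weight=True)
    cross_val_score(
        est,
        X,
        y,
        cv=GroupKFold(n_splits=3),
        params={"groups": groups, "sample_weight": sample_weight},
    )
    sklearn.set_config(enable_metadata_routing=False)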
@@ -1129,9 +1345,6 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method): test : array-like of shape (n_test_samples,) Indices of test samples. - verbose : int - The verbosity level. - fit_params : dict or None Parameters that will be passed to ``estimator.fit``. @@ -1707,7 +1920,6 @@ def learning_curve( test, train_sizes_abs, scorer, - verbose, return_times, error_score=error_score, fit_params=fit_params, @@ -1726,12 +1938,14 @@ def learning_curve( clone(estimator), X, y, - scorer, - train, - test, - verbose, + scorer=scorer, + train=train, + test=test, + verbose=verbose, parameters=None, fit_params=fit_params, + # TODO(SLEP6): support score params here + score_params=None, return_train_score=True, error_score=error_score, return_times=return_times, @@ -1833,7 +2047,6 @@ def _incremental_fit_estimator( test, train_sizes, scorer, - verbose, return_times, error_score, fit_params, @@ -1863,9 +2076,27 @@ def _incremental_fit_estimator( start_score = time.time() - test_scores.append(_score(estimator, X_test, y_test, scorer, error_score)) - train_scores.append(_score(estimator, X_train, y_train, scorer, error_score)) - + # TODO(SLEP6): support score params in the following two calls + test_scores.append( + _score( + estimator, + X_test, + y_test, + scorer, + score_params=None, + error_score=error_score, + ) + ) + train_scores.append( + _score( + estimator, + X_train, + y_train, + scorer, + score_params=None, + error_score=error_score, + ) + ) score_time = time.time() - start_score score_times.append(score_time) @@ -2025,12 +2256,14 @@ def validation_curve( clone(estimator), X, y, - scorer, - train, - test, - verbose, + scorer=scorer, + train=train, + test=test, + verbose=verbose, parameters={param_name: v}, fit_params=fit_params, + # TODO(SLEP6): support score params here + score_params=None, return_train_score=True, error_score=error_score, ) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 6ea52049f3ced..04c3f1f156fab 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -900,18 +900,16 @@ def check_cv_results_array_types(search, param_keys, score_keys): assert cv_results["rank_test_%s" % key].dtype == np.int32 -def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand): +def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand, extra_keys=()): # Test the search.cv_results_ contains all the required results - assert_array_equal( - sorted(cv_results.keys()), sorted(param_keys + score_keys + ("params",)) - ) + all_keys = param_keys + score_keys + extra_keys + assert_array_equal(sorted(cv_results.keys()), sorted(all_keys + ("params",))) assert all(cv_results[key].shape == (n_cand,) for key in param_keys + score_keys) def test_grid_search_cv_results(): X, y = make_classification(n_samples=50, n_features=4, random_state=42) - n_splits = 3 n_grid_points = 6 params = [ dict( @@ -949,9 +947,7 @@ def test_grid_search_cv_results(): ) n_candidates = n_grid_points - search = GridSearchCV( - SVC(), cv=n_splits, param_grid=params, return_train_score=True - ) + search = GridSearchCV(SVC(), cv=3, param_grid=params, return_train_score=True) search.fit(X, y) cv_results = search.cv_results_ # Check if score and timing are reasonable @@ -967,17 +963,20 @@ def test_grid_search_cv_results(): check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates) # Check masking cv_results = search.cv_results_ - n_candidates = 
len(search.cv_results_["params"]) - assert all( + + poly_results = [ ( cv_results["param_C"].mask[i] and cv_results["param_gamma"].mask[i] and not cv_results["param_degree"].mask[i] ) for i in range(n_candidates) - if cv_results["param_kernel"][i] == "linear" - ) - assert all( + if cv_results["param_kernel"][i] == "poly" + ] + assert all(poly_results) + assert len(poly_results) == 2 + + rbf_results = [ ( not cv_results["param_C"].mask[i] and not cv_results["param_gamma"].mask[i] @@ -985,13 +984,14 @@ def test_grid_search_cv_results(): ) for i in range(n_candidates) if cv_results["param_kernel"][i] == "rbf" - ) + ] + assert all(rbf_results) + assert len(rbf_results) == 4 def test_random_search_cv_results(): X, y = make_classification(n_samples=50, n_features=4, random_state=42) - n_splits = 3 n_search_iter = 30 params = [ @@ -1016,12 +1016,12 @@ def test_random_search_cv_results(): "mean_score_time", "std_score_time", ) - n_cand = n_search_iter + n_candidates = n_search_iter search = RandomizedSearchCV( SVC(), n_iter=n_search_iter, - cv=n_splits, + cv=3, param_distributions=params, return_train_score=True, ) @@ -1029,8 +1029,7 @@ def test_random_search_cv_results(): cv_results = search.cv_results_ # Check results structure check_cv_results_array_types(search, param_keys, score_keys) - check_cv_results_keys(cv_results, param_keys, score_keys, n_cand) - n_candidates = len(search.cv_results_["params"]) + check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates) assert all( ( cv_results["param_C"].mask[i] @@ -1038,7 +1037,7 @@ def test_random_search_cv_results(): and not cv_results["param_degree"].mask[i] ) for i in range(n_candidates) - if cv_results["param_kernel"][i] == "linear" + if cv_results["param_kernel"][i] == "poly" ) assert all( ( diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index d92f624441541..151498205dd39 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -14,6 +14,7 @@ ) from scipy.special import comb +from sklearn import config_context from sklearn.datasets import load_digits, make_classification from sklearn.dummy import DummyClassifier from sklearn.model_selection import ( @@ -43,7 +44,15 @@ _yields_constant_splits, ) from sklearn.svm import SVC -from sklearn.tests.test_metadata_routing import assert_request_is_empty +from sklearn.tests.metadata_routing_common import assert_request_is_empty +from sklearn.utils._array_api import ( + _convert_to_numpy, + get_namespace, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._array_api import ( + device as array_api_device, +) from sklearn.utils._mocking import MockDataFrame from sklearn.utils._testing import ( assert_allclose, @@ -51,6 +60,9 @@ assert_array_equal, ignore_warnings, ) +from sklearn.utils.estimator_checks import ( + _array_api_for_tests, +) from sklearn.utils.validation import _num_samples NO_GROUP_SPLITTERS = [ @@ -1259,6 +1271,70 @@ def test_train_test_split_default_test_size(train_size, exp_train, exp_test): assert len(X_test) == exp_test +@pytest.mark.parametrize( + "array_namepsace, device, dtype", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize( + "shuffle,stratify", + ( + (True, None), + (True, np.hstack((np.ones(6), np.zeros(4)))), + # stratification only works with shuffling + (False, None), + ), +) +def test_array_api_train_test_split(shuffle, stratify, array_namepsace, device, dtype): + xp, device, dtype = 
_array_api_for_tests(array_namepsace, device, dtype) + + X = np.arange(100).reshape((10, 10)) + y = np.arange(10) + + X_np = X.astype(dtype) + X_xp = xp.asarray(X_np, device=device) + + y_np = y.astype(dtype) + y_xp = xp.asarray(y_np, device=device) + + X_train_np, X_test_np, y_train_np, y_test_np = train_test_split( + X_np, y, random_state=0, shuffle=shuffle, stratify=stratify + ) + with config_context(array_api_dispatch=True): + if stratify is not None: + stratify_xp = xp.asarray(stratify) + else: + stratify_xp = stratify + X_train_xp, X_test_xp, y_train_xp, y_test_xp = train_test_split( + X_xp, y_xp, shuffle=shuffle, stratify=stratify_xp, random_state=0 + ) + + # Check that namespace is preserved, has to happen with + # array_api_dispatch enabled. + assert get_namespace(X_train_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(X_test_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(y_train_xp)[0] == get_namespace(y_xp)[0] + assert get_namespace(y_test_xp)[0] == get_namespace(y_xp)[0] + + # Check device and dtype is preserved on output + assert array_api_device(X_train_xp) == array_api_device(X_xp) + assert array_api_device(y_train_xp) == array_api_device(y_xp) + assert array_api_device(X_test_xp) == array_api_device(X_xp) + assert array_api_device(y_test_xp) == array_api_device(y_xp) + + assert X_train_xp.dtype == X_xp.dtype + assert y_train_xp.dtype == y_xp.dtype + assert X_test_xp.dtype == X_xp.dtype + assert y_test_xp.dtype == y_xp.dtype + + assert_allclose( + _convert_to_numpy(X_train_xp, xp=xp), + X_train_np, + ) + assert_allclose( + _convert_to_numpy(X_test_xp, xp=xp), + X_test_np, + ) + + def test_train_test_split(): X = np.arange(100).reshape((10, 10)) X_s = coo_matrix(X) @@ -1808,7 +1884,7 @@ def test_nested_cv(): error_score="raise", ) cross_val_score( - gs, X=X, y=y, groups=groups, cv=outer_cv, fit_params={"groups": groups} + gs, X=X, y=y, groups=groups, cv=outer_cv, params={"groups": groups} ) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index d4cc09ee01044..6c89f89afa684 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from scipy.stats import norm, randint +from scipy.stats import expon, norm, randint from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier @@ -23,7 +23,11 @@ _SubsampleMetaSplitter, _top_k, ) -from sklearn.svm import LinearSVC +from sklearn.model_selection.tests.test_search import ( + check_cv_results_array_types, + check_cv_results_keys, +) +from sklearn.svm import SVC, LinearSVC class FastClassifier(DummyClassifier): @@ -777,3 +781,68 @@ def test_select_best_index(SearchCV): # we expect the index of 'i' best_index = SearchCV._select_best_index(None, None, results) assert best_index == 8 + + +def test_halving_random_search_list_of_dicts(): + """Check the behaviour of the `HalvingRandomSearchCV` with `param_distribution` + being a list of dictionary. 
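The new `test_halving_random_search_list_of_dicts` below checks that `HalvingRandomSearchCV` accepts `param_distributions` as a list of dicts, mirroring what `RandomizedSearchCV` already accepts. A compact usage sketch of that call pattern, as exercised by the test (note the experimental import the halving estimators still require):

    from scipy.stats import expon
    from sklearn.datasets import make_classification
    from sklearn.experimental import enable_halving_search_cv  # noqa: F401
    from sklearn.model_selection import HalvingRandomSearchCV
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=150, n_features=4, random_state=42)
    # Two sub-spaces: one continuous (rbf) and one discrete (poly degrees).
    param_distributions = [
        {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)},
        {"kernel": ["poly"], "degree": [2, 3]},
    ]
    search = HalvingRandomSearchCV(
        SVC(), param_distributions=param_distributions, cv=3, random_state=0
    ).fit(X, y)
    print(search.best_params_)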
+ """ + X, y = make_classification(n_samples=150, n_features=4, random_state=42) + + params = [ + {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)}, + {"kernel": ["poly"], "degree": [2, 3]}, + ] + param_keys = ( + "param_C", + "param_degree", + "param_gamma", + "param_kernel", + ) + score_keys = ( + "mean_test_score", + "mean_train_score", + "rank_test_score", + "split0_test_score", + "split1_test_score", + "split2_test_score", + "split0_train_score", + "split1_train_score", + "split2_train_score", + "std_test_score", + "std_train_score", + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ) + extra_keys = ("n_resources", "iter") + + search = HalvingRandomSearchCV( + SVC(), cv=3, param_distributions=params, return_train_score=True, random_state=0 + ) + search.fit(X, y) + n_candidates = sum(search.n_candidates_) + cv_results = search.cv_results_ + # Check results structure + check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys) + check_cv_results_array_types(search, param_keys, score_keys) + + assert all( + ( + cv_results["param_C"].mask[i] + and cv_results["param_gamma"].mask[i] + and not cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "poly" + ) + assert all( + ( + not cv_results["param_C"].mask[i] + and not cv_results["param_gamma"].mask[i] + and cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "rbf" + ) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index ba9f66ab240e4..c944b06b30860 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -73,6 +73,13 @@ from sklearn.pipeline import Pipeline from sklearn.preprocessing import LabelEncoder, scale from sklearn.svm import SVC, LinearSVC +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingScorer, + ConsumingSplitter, + _Registry, + check_recorded_metadata, +) from sklearn.utils import shuffle from sklearn.utils._mocking import CheckingClassifier, MockDataFrame from sklearn.utils._testing import ( @@ -706,7 +713,7 @@ def assert_fit_params(clf): "dummy_obj": DUMMY_OBJ, "callback": assert_fit_params, } - cross_val_score(clf, X, y, fit_params=fit_params) + cross_val_score(clf, X, y, params=fit_params) def test_cross_val_score_score_func(): @@ -1160,7 +1167,7 @@ def test_cross_val_score_sparse_fit_params(): X, y = iris.data, iris.target clf = MockClassifier() fit_params = {"sparse_sample_weight": coo_matrix(np.eye(X.shape[0]))} - a = cross_val_score(clf, X, y, fit_params=fit_params, cv=3) + a = cross_val_score(clf, X, y, params=fit_params, cv=3) assert_array_equal(a, np.ones(3)) @@ -2082,12 +2089,23 @@ def test_fit_and_score_failing(): failing_clf = FailingClassifier(FailingClassifier.FAILING_PARAMETER) # dummy X data X = np.arange(1, 10) - fit_and_score_args = [failing_clf, X, None, dict(), None, None, 0, None, None] + fit_and_score_args = dict( + estimator=failing_clf, + X=X, + y=None, + scorer=dict(), + train=None, + test=None, + verbose=0, + parameters=None, + fit_params=None, + score_params=None, + ) # passing error score to trigger the warning message - fit_and_score_kwargs = {"error_score": "raise"} + fit_and_score_args["error_score"] = "raise" # check if exception was raised, with default error_score='raise' with pytest.raises(ValueError, match="Failing classifier failed as 
required"): - _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) + _fit_and_score(**fit_and_score_args) assert failing_clf.score() == 0.0 # FailingClassifier coverage @@ -2097,14 +2115,21 @@ def test_fit_and_score_working(): clf = SVC(kernel="linear", random_state=0) train, test = next(ShuffleSplit().split(X)) # Test return_parameters option - fit_and_score_args = [clf, X, y, dict(), train, test, 0] - fit_and_score_kwargs = { - "parameters": {"max_iter": 100, "tol": 0.1}, - "fit_params": None, - "return_parameters": True, - } - result = _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) - assert result["parameters"] == fit_and_score_kwargs["parameters"] + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=dict(), + train=train, + test=test, + verbose=0, + parameters={"max_iter": 100, "tol": 0.1}, + fit_params=None, + score_params=None, + return_parameters=True, + ) + result = _fit_and_score(**fit_and_score_args) + assert result["parameters"] == fit_and_score_args["parameters"] class DataDependentFailingClassifier(BaseEstimator): @@ -2315,13 +2340,22 @@ def test_fit_and_score_verbosity( train, test = next(ShuffleSplit().split(X)) # test print without train score - fit_and_score_args = [clf, X, y, scorer, train, test, verbose, None, None] - fit_and_score_kwargs = { - "return_train_score": train_score, - "split_progress": split_prg, - "candidate_progress": cdt_prg, - } - _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=scorer, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=None, + score_params=None, + return_train_score=train_score, + split_progress=split_prg, + candidate_progress=cdt_prg, + ) + _fit_and_score(**fit_and_score_args) out, _ = capsys.readouterr() outlines = out.split("\n") if len(outlines) > 2: @@ -2336,9 +2370,15 @@ def test_score(): def two_params_scorer(estimator, X_test): return None - fit_and_score_args = [None, None, None, two_params_scorer] with pytest.raises(ValueError, match=error_message): - _score(*fit_and_score_args, error_score=np.nan) + _score( + estimator=None, + X_test=None, + y_test=None, + scorer=two_params_scorer, + score_params=None, + error_score=np.nan, + ) def test_callable_multimetric_confusion_matrix_cross_validate(): @@ -2391,3 +2431,149 @@ def test_cross_validate_return_indices(global_random_seed): for split_idx, (expected_train_idx, expected_test_idx) in enumerate(cv.split(X, y)): assert_array_equal(train_indices[split_idx], expected_train_idx) assert_array_equal(test_indices[split_idx], expected_test_idx) + + +# Tests for metadata routing in cross_val* +# ======================================== + + +# TODO(1.6): remove this test in 1.6 +def test_cross_validate_fit_param_deprecation(): + """Check that we warn about deprecating `fit_params`.""" + with pytest.warns(FutureWarning, match="`fit_params` is deprecated"): + cross_validate(estimator=ConsumingClassifier(), X=X, y=y, cv=2, fit_params={}) + + with pytest.raises( + ValueError, match="`params` and `fit_params` cannot both be provided" + ): + cross_validate( + estimator=ConsumingClassifier(), X=X, y=y, fit_params={}, params={} + ) + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "cv_method", [cross_validate, cross_val_score, cross_val_predict] +) +def test_groups_with_routing_validation(cv_method): + """Check that we raise an error if `groups` are passed to the cv method instead + of `params` when metadata routing is enabled. 
+ """ + with pytest.raises(ValueError, match="`groups` can only be passed if"): + cv_method( + estimator=ConsumingClassifier(), + X=X, + y=y, + groups=[], + ) + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "cv_method", [cross_validate, cross_val_score, cross_val_predict] +) +def test_passed_unrequested_metadata(cv_method): + """Check that we raise an error when passing metadata that is not + requested.""" + err_msg = re.escape("['metadata'] are passed to cross validation") + with pytest.raises(ValueError, match=err_msg): + cv_method( + estimator=ConsumingClassifier(), + X=X, + y=y, + params=dict(metadata=[]), + ) + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "cv_method", [cross_validate, cross_val_score, cross_val_predict] +) +def test_cross_validate_routing(cv_method): + """Check that the respective cv method is properly dispatching the metadata + to the consumer.""" + scorer_registry = _Registry() + scorer = ConsumingScorer(registry=scorer_registry).set_score_request( + sample_weight="score_weights", metadata="score_metadata" + ) + splitter_registry = _Registry() + splitter = ConsumingSplitter(registry=splitter_registry).set_split_request( + groups="split_groups", metadata="split_metadata" + ) + estimator_registry = _Registry() + estimator = ConsumingClassifier(registry=estimator_registry).set_fit_request( + sample_weight="fit_sample_weight", metadata="fit_metadata" + ) + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + score_weights = rng.rand(n_samples) + score_metadata = rng.rand(n_samples) + split_groups = rng.randint(0, 3, n_samples) + split_metadata = rng.rand(n_samples) + fit_sample_weight = rng.rand(n_samples) + fit_metadata = rng.rand(n_samples) + + extra_params = { + cross_validate: dict(scoring=dict(my_scorer=scorer, accuracy="accuracy")), + # cross_val_score doesn't support multiple scorers + cross_val_score: dict(scoring=scorer), + # cross_val_predict doesn't need a scorer + cross_val_predict: dict(), + } + + params = dict( + split_groups=split_groups, + split_metadata=split_metadata, + fit_sample_weight=fit_sample_weight, + fit_metadata=fit_metadata, + ) + + if cv_method is not cross_val_predict: + params.update( + score_weights=score_weights, + score_metadata=score_metadata, + ) + + cv_method( + estimator, + X=X, + y=y, + cv=splitter, + **extra_params[cv_method], + params=params, + ) + + if cv_method is not cross_val_predict: + # cross_val_predict doesn't need a scorer + assert len(scorer_registry) + for _scorer in scorer_registry: + check_recorded_metadata( + obj=_scorer, + method="score", + split_params=("sample_weight", "metadata"), + sample_weight=score_weights, + metadata=score_metadata, + ) + + assert len(splitter_registry) + for _splitter in splitter_registry: + check_recorded_metadata( + obj=_splitter, + method="split", + groups=split_groups, + metadata=split_metadata, + ) + + assert len(estimator_registry) + for _estimator in estimator_registry: + check_recorded_metadata( + obj=_estimator, + method="fit", + split_params=("sample_weight", "metadata"), + sample_weight=fit_sample_weight, + metadata=fit_metadata, + ) + + +# End of metadata routing tests +# ============================= diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 8bd71924f954b..a75f41307b758 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -163,10 +163,10 @@ def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_para if _routing_enabled(): routed_params = 
process_routing( - obj=self, - method="partial_fit", - other_params=partial_fit_params, + self, + "partial_fit", sample_weight=sample_weight, + **partial_fit_params, ) else: if sample_weight is not None and not has_fit_parameter( @@ -249,10 +249,10 @@ def fit(self, X, y, sample_weight=None, **fit_params): if _routing_enabled(): routed_params = process_routing( - obj=self, - method="fit", - other_params=fit_params, + self, + "fit", sample_weight=sample_weight, + **fit_params, ) else: if sample_weight is not None and not has_fit_parameter( @@ -706,9 +706,7 @@ def fit(self, X, Y, **fit_params): del Y_pred_chain if _routing_enabled(): - routed_params = process_routing( - obj=self, method="fit", other_params=fit_params - ) + routed_params = process_routing(self, "fit", **fit_params) else: routed_params = Bunch(estimator=Bunch(fit=fit_params)) diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx deleted file mode 100644 index d9b933cb43c66..0000000000000 --- a/sklearn/neighbors/_ball_tree.pyx +++ /dev/null @@ -1,195 +0,0 @@ -# Author: Jake Vanderplas -# License: BSD 3 clause - -__all__ = ['BallTree'] - -DOC_DICT = {'BinaryTree': 'BallTree', 'binary_tree': 'ball_tree'} - -VALID_METRICS = [ - 'BrayCurtisDistance64', - 'CanberraDistance64', - 'ChebyshevDistance64', - 'DiceDistance64', - 'EuclideanDistance64', - 'HammingDistance64', - 'HaversineDistance64', - 'JaccardDistance64', - 'MahalanobisDistance64', - 'ManhattanDistance64', - 'MinkowskiDistance64', - 'PyFuncDistance64', - 'RogersTanimotoDistance64', - 'RussellRaoDistance64', - 'SEuclideanDistance64', - 'SokalMichenerDistance64', - 'SokalSneathDistance64', - 'WMinkowskiDistance64', -] - -include "_binary_tree.pxi" - -# Inherit BallTree from BinaryTree -cdef class BallTree(BinaryTree): - __doc__ = CLASS_DOC.format(**DOC_DICT) - pass - - -# ---------------------------------------------------------------------- -# The functions below specialized the Binary Tree as a Ball Tree -# -# Note that these functions use the concept of "reduced distance". -# The reduced distance, defined for some metrics, is a quantity which -# is more efficient to compute than the distance, but preserves the -# relative rankings of the true distance. For example, the reduced -# distance for the Euclidean metric is the squared-euclidean distance. -# For some metrics, the reduced distance is simply the distance. 
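The comment above states the invariant the tree code exploits: the reduced distance (squared Euclidean for the Euclidean metric) is cheaper to compute but induces the same ordering as the true distance, so pruning and neighbor selection can run on `rdist` and only convert back to a true distance at the end. A small NumPy check of that rank-preservation property:

    import numpy as np

    rng = np.random.RandomState(0)
    query = rng.random_sample(3)
    points = rng.random_sample((10, 3))

    dist = np.sqrt(((points - query) ** 2).sum(axis=1))   # true Euclidean distance
    rdist = ((points - query) ** 2).sum(axis=1)           # reduced distance: no sqrt

    # Identical neighbor ordering, so pruning decisions based on rdist are valid.
    assert np.array_equal(np.argsort(dist), np.argsort(rdist))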
- -cdef int allocate_data(BinaryTree tree, intp_t n_nodes, - intp_t n_features) except -1: - """Allocate arrays needed for the KD Tree""" - tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype=np.float64) - return 0 - - -cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, intp_t i_node, - intp_t idx_start, intp_t idx_end) except -1: - """Initialize the node for the dataset stored in tree.data""" - cdef intp_t n_features = tree.data.shape[1] - cdef intp_t n_points = idx_end - idx_start - - cdef intp_t i, j - cdef float64_t radius - cdef float64_t *this_pt - - cdef intp_t* idx_array = &tree.idx_array[0] - cdef float64_t* data = &tree.data[0, 0] - cdef float64_t* centroid = &tree.node_bounds[0, i_node, 0] - - cdef bint with_sample_weight = tree.sample_weight is not None - cdef float64_t* sample_weight - cdef float64_t sum_weight_node - if with_sample_weight: - sample_weight = &tree.sample_weight[0] - - # determine Node centroid - for j in range(n_features): - centroid[j] = 0 - - if with_sample_weight: - sum_weight_node = 0 - for i in range(idx_start, idx_end): - sum_weight_node += sample_weight[idx_array[i]] - this_pt = data + n_features * idx_array[i] - for j from 0 <= j < n_features: - centroid[j] += this_pt[j] * sample_weight[idx_array[i]] - - for j in range(n_features): - centroid[j] /= sum_weight_node - else: - for i in range(idx_start, idx_end): - this_pt = data + n_features * idx_array[i] - for j from 0 <= j < n_features: - centroid[j] += this_pt[j] - - for j in range(n_features): - centroid[j] /= n_points - - # determine Node radius - radius = 0 - for i in range(idx_start, idx_end): - radius = fmax(radius, - tree.rdist(centroid, - data + n_features * idx_array[i], - n_features)) - - node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) - node_data[i_node].idx_start = idx_start - node_data[i_node].idx_end = idx_end - return 0 - - -cdef inline float64_t min_dist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1 nogil: - """Compute the minimum distance between a point and a node""" - cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - return fmax(0, dist_pt - tree.node_data[i_node].radius) - - -cdef inline float64_t max_dist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1: - """Compute the maximum distance between a point and a node""" - cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - return dist_pt + tree.node_data[i_node].radius - - -cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, - float64_t* min_dist, float64_t* max_dist) except -1 nogil: - """Compute the minimum and maximum distance between a point and a node""" - cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - cdef float64_t rad = tree.node_data[i_node].radius - min_dist[0] = fmax(0, dist_pt - rad) - max_dist[0] = dist_pt + rad - return 0 - - -cdef inline float64_t min_rdist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1 nogil: - """Compute the minimum reduced-distance between a point and a node""" - if tree.euclidean: - return euclidean_dist_to_rdist64(min_dist(tree, i_node, pt)) - else: - return tree.dist_metric._dist_to_rdist(min_dist(tree, i_node, pt)) - - -cdef inline float64_t max_rdist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1: - """Compute the maximum reduced-distance between a point and a node""" - if tree.euclidean: - return euclidean_dist_to_rdist64(max_dist(tree, i_node, pt)) 
- else: - return tree.dist_metric._dist_to_rdist(max_dist(tree, i_node, pt)) - - -cdef inline float64_t min_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the minimum distance between two nodes""" - cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], - &tree1.node_bounds[0, i_node1, 0], - tree1.data.shape[1]) - return fmax(0, (dist_pt - tree1.node_data[i_node1].radius - - tree2.node_data[i_node2].radius)) - - -cdef inline float64_t max_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the maximum distance between two nodes""" - cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], - &tree1.node_bounds[0, i_node1, 0], - tree1.data.shape[1]) - return (dist_pt + tree1.node_data[i_node1].radius - + tree2.node_data[i_node2].radius) - - -cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the minimum reduced distance between two nodes""" - if tree1.euclidean: - return euclidean_dist_to_rdist64(min_dist_dual(tree1, i_node1, - tree2, i_node2)) - else: - return tree1.dist_metric._dist_to_rdist(min_dist_dual(tree1, i_node1, - tree2, i_node2)) - - -cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the maximum reduced distance between two nodes""" - if tree1.euclidean: - return euclidean_dist_to_rdist64(max_dist_dual(tree1, i_node1, - tree2, i_node2)) - else: - return tree1.dist_metric._dist_to_rdist(max_dist_dual(tree1, i_node1, - tree2, i_node2)) diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp new file mode 100644 index 0000000000000..92b26714e5d9f --- /dev/null +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -0,0 +1,284 @@ +{{py: + +# Generated file: _ball_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + +# Author: Jake Vanderplas +# License: BSD 3 clause + +}} + + +__all__ = ['BallTree', 'BallTree64', 'BallTree32'] + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'BallTree{{name_suffix}}', + 'binary_tree': 'ball_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'BrayCurtisDistance{{name_suffix}}', + 'CanberraDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'DiceDistance{{name_suffix}}', + 'EuclideanDistance{{name_suffix}}', + 'HammingDistance{{name_suffix}}', + 'HaversineDistance{{name_suffix}}', + 'JaccardDistance{{name_suffix}}', + 'MahalanobisDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}', + 'PyFuncDistance{{name_suffix}}', + 'RogersTanimotoDistance{{name_suffix}}', + 'RussellRaoDistance{{name_suffix}}', + 'SEuclideanDistance{{name_suffix}}', + 'SokalMichenerDistance{{name_suffix}}', + 'SokalSneathDistance{{name_suffix}}', + 'WMinkowskiDistance{{name_suffix}}', +] + +{{endfor}} + +include "_binary_tree.pxi" + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit BallTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class BallTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) + pass + +{{endfor}} + + 
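The `{{py: ...}}`, `{{for ...}}`, and `{{endfor}}` markers above are Tempita template syntax: at build time `_ball_tree.pyx.tp` is rendered into a `.pyx` file containing a float64 and a float32 specialization of every class and helper. A toy rendering sketch, assuming the Tempita package bundled with Cython (which scikit-learn's build helpers rely on) is available; the template body is made up for illustration:

    # Illustrative only: renders a tiny template using the same {{for}} syntax
    # as the .pyx.tp files above.
    from Cython import Tempita

    template = Tempita.Template(
        """
    {{for name_suffix in ['64', '32']}}
    cdef class BallTree{{name_suffix}}(BinaryTree{{name_suffix}}):
        pass
    {{endfor}}
    """
    )
    # Expands the loop into one class stub per dtype suffix.
    print(template.substitute())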
+#---------------------------------------------------------------------- +# The functions below specialized the Binary Tree as a Ball Tree +# +# Note that these functions use the concept of "reduced distance". +# The reduced distance, defined for some metrics, is a quantity which +# is more efficient to compute than the distance, but preserves the +# relative rankings of the true distance. For example, the reduced +# distance for the Euclidean metric is the squared-euclidean distance. +# For some metrics, the reduced distance is simply the distance. + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: + """Allocate arrays needed for the KD Tree""" + tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype={{INPUT_DTYPE}}) + return 0 + + +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: + """Initialize the node for the dataset stored in tree.data""" + cdef intp_t n_features = tree.data.shape[1] + cdef intp_t n_points = idx_end - idx_start + + cdef intp_t i, j + cdef float64_t radius + cdef {{INPUT_DTYPE_t}} *this_pt + + cdef intp_t* idx_array = &tree.idx_array[0] + cdef {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* centroid = &tree.node_bounds[0, i_node, 0] + + cdef bint with_sample_weight = tree.sample_weight is not None + cdef {{INPUT_DTYPE_t}}* sample_weight + cdef float64_t sum_weight_node + if with_sample_weight: + sample_weight = &tree.sample_weight[0] + + # determine Node centroid + for j in range(n_features): + centroid[j] = 0 + + if with_sample_weight: + sum_weight_node = 0 + for i in range(idx_start, idx_end): + sum_weight_node += sample_weight[idx_array[i]] + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] * sample_weight[idx_array[i]] + + for j in range(n_features): + centroid[j] /= sum_weight_node + else: + for i in range(idx_start, idx_end): + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] + + for j in range(n_features): + centroid[j] /= n_points + + # determine Node radius + radius = 0 + for i in range(idx_start, idx_end): + radius = fmax(radius, + tree.rdist(centroid, + data + n_features * idx_array[i], + n_features)) + + node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end + return 0 + + +cdef inline float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return fmax(0, dist_pt - tree.node_data[i_node].radius) + + +cdef inline float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return dist_pt + tree.node_data[i_node].radius + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 
nogil: + """Compute the minimum and maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + cdef float64_t rad = tree.node_data[i_node].radius + min_dist[0] = fmax(0, dist_pt - rad) + max_dist[0] = dist_pt + rad + return 0 + + +cdef inline float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return fmax(0, (dist_pt - tree1.node_data[i_node1].radius + - tree2.node_data[i_node2].radius)) + + +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return (dist_pt + tree1.node_data[i_node1].radius + + tree2.node_data[i_node2].radius) + + +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + + +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class BallTree(BallTree64): + __doc__ = CLASS_DOC.format(BinaryTree="BallTree") + pass diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index dcff18e10fa48..519db9bead3d3 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -19,7 +19,7 @@ from ..base import BaseEstimator, MultiOutputMixin, is_classifier from ..exceptions import DataConversionWarning, EfficiencyWarning -from ..metrics import 
pairwise_distances_chunked +from ..metrics import DistanceMetric, pairwise_distances_chunked from ..metrics._pairwise_distances_reduction import ( ArgKmin, RadiusNeighbors, @@ -414,7 +414,11 @@ def _check_algorithm_metric(self): if self.algorithm == "auto": if self.metric == "precomputed": alg_check = "brute" - elif callable(self.metric) or self.metric in VALID_METRICS["ball_tree"]: + elif ( + callable(self.metric) + or self.metric in VALID_METRICS["ball_tree"] + or isinstance(self.metric, DistanceMetric) + ): alg_check = "ball_tree" else: alg_check = "brute" @@ -430,7 +434,9 @@ def _check_algorithm_metric(self): "in very poor performance." % self.metric ) - elif self.metric not in VALID_METRICS[alg_check]: + elif self.metric not in VALID_METRICS[alg_check] and not isinstance( + self.metric, DistanceMetric + ): raise ValueError( "Metric '%s' not valid. Use " "sorted(sklearn.neighbors.VALID_METRICS['%s']) " @@ -563,9 +569,11 @@ def _fit(self, X, y=None): if self.algorithm not in ("auto", "brute"): warnings.warn("cannot use tree with sparse input: using brute force") - if self.effective_metric_ not in VALID_METRICS_SPARSE[ - "brute" - ] and not callable(self.effective_metric_): + if ( + self.effective_metric_ not in VALID_METRICS_SPARSE["brute"] + and not callable(self.effective_metric_) + and not isinstance(self.effective_metric_, DistanceMetric) + ): raise ValueError( "Metric '%s' not valid for sparse input. " "Use sorted(sklearn.neighbors." diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi.tp similarity index 90% rename from sklearn/neighbors/_binary_tree.pxi rename to sklearn/neighbors/_binary_tree.pxi.tp index b60ea3a0a6d70..6322f809f7eb9 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -1,14 +1,32 @@ -#!python +{{py: +# Generated file: _binary_tree.pxi + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE + # + ('64', 'float64_t', 'np.float64', 'cnp.NPY_DOUBLE'), + ('32', 'float32_t', 'np.float32', 'cnp.NPY_FLOAT') +] # KD Tree and Ball Tree # ===================== # # Author: Jake Vanderplas , 2012-2013 +# Omar Salman +# # License: BSD # -# This file is meant to be a literal include in a pyx file. -# See ball_tree.pyx and kd_tree.pyx +# _binary_tree.pxi is generated and is then literally Cython included in +# ball_tree.pyx and kd_tree.pyx. See ball_tree.pyx.tp and kd_tree.pyx.tp. + +}} + + +# KD Tree and Ball Tree +# ===================== # # The routines here are the core algorithms of the KDTree and BallTree # structures. 
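The `_base.py` hunk above lets a pre-constructed `DistanceMetric` instance pass metric validation and be treated as tree-compatible under `algorithm="auto"`. A hedged sketch of the usage this appears to enable; anything beyond what the diff shows is an assumption:

    import numpy as np
    from sklearn.metrics import DistanceMetric
    from sklearn.neighbors import NearestNeighbors

    X = np.random.RandomState(0).random_sample((30, 4))

    # Pass a DistanceMetric object instead of a metric name; with the change
    # above this should be accepted and routed to a tree-based algorithm.
    manhattan = DistanceMetric.get_metric("manhattan")
    nn = NearestNeighbors(n_neighbors=3, metric=manhattan).fit(X)
    distances, indices = nn.kneighbors(X[:2])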
If Cython supported polymorphism, we would be able to @@ -143,6 +161,7 @@ # """Compute the maximum distance between two nodes""" cimport numpy as cnp +from cython cimport floating from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma from libc.math cimport fmin, fmax from libc.stdlib cimport calloc, malloc, free @@ -154,15 +173,19 @@ import warnings from ..metrics._dist_metrics cimport ( DistanceMetric, DistanceMetric64, + DistanceMetric32, euclidean_dist64, + euclidean_dist32, euclidean_rdist64, + euclidean_rdist32, euclidean_dist_to_rdist64, + euclidean_dist_to_rdist32, ) from ._partition_nodes cimport partition_node_indices from ..utils import check_array -from ..utils._typedefs cimport float64_t, intp_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t from ..utils._heap cimport heap_push from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort @@ -500,8 +523,9 @@ def kernel_norm(h, d, kernel, return_log=False): else: return np.exp(result) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} -cdef class NeighborsHeap: +cdef class NeighborsHeap{{name_suffix}}: """A max-heap structure to keep track of distances/indices of neighbors This implements an efficient pre-allocated set of fixed-size heaps @@ -516,19 +540,19 @@ cdef class NeighborsHeap: n_nbrs : int the size of each heap. """ - cdef float64_t[:, ::1] distances + cdef {{INPUT_DTYPE_t}}[:, ::1] distances cdef intp_t[:, ::1] indices def __cinit__(self): # One-element arrays are used as placeholders to prevent # any problem due to potential access to those attributes # (e.g. assigning to NULL or a to value in another segment). - self.distances = np.zeros((1, 1), dtype=np.float64, order='C') + self.distances = np.zeros((1, 1), dtype={{INPUT_DTYPE}}, order='C') self.indices = np.zeros((1, 1), dtype=np.intp, order='C') def __init__(self, n_pts, n_nbrs): self.distances = np.full( - (n_pts, n_nbrs), np.inf, dtype=np.float64, order='C' + (n_pts, n_nbrs), np.inf, dtype={{INPUT_DTYPE}}, order='C' ) self.indices = np.zeros((n_pts, n_nbrs), dtype=np.intp, order='C') @@ -571,14 +595,16 @@ cdef class NeighborsHeap: ) return 0 -# ------------------------------------------------------------ +{{endfor}} + +#------------------------------------------------------------ # find_node_split_dim: # this computes the equivalent of # j_max = np.argmax(np.max(data, 0) - np.min(data, 0)) -cdef intp_t find_node_split_dim(float64_t* data, - intp_t* node_indices, - intp_t n_features, - intp_t n_points) except -1: +cdef intp_t find_node_split_dim(const floating* data, + intp_t* node_indices, + intp_t n_features, + intp_t n_points) except -1: """Find the dimension with the largest spread. 
Parameters @@ -764,29 +790,31 @@ def newObj(obj): return obj.__new__(obj) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} + ###################################################################### -# define the reverse mapping of VALID_METRICS +# define the reverse mapping of VALID_METRICS{{name_suffix}} from sklearn.metrics._dist_metrics import get_valid_metric_ids -VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS) +VALID_METRIC_IDS{{name_suffix}} = get_valid_metric_ids(VALID_METRICS{{name_suffix}}) ###################################################################### # Binary Tree class -cdef class BinaryTree: +cdef class BinaryTree{{name_suffix}}: - cdef readonly const float64_t[:, ::1] data - cdef readonly const float64_t[::1] sample_weight + cdef readonly const {{INPUT_DTYPE_t}}[:, ::1] data + cdef readonly const {{INPUT_DTYPE_t}}[::1] sample_weight cdef public float64_t sum_weight cdef public const intp_t[::1] idx_array cdef public const NodeData_t[::1] node_data - cdef public const float64_t[:, :, ::1] node_bounds + cdef public const {{INPUT_DTYPE_t}}[:, :, ::1] node_bounds cdef intp_t leaf_size cdef intp_t n_levels cdef intp_t n_nodes - cdef DistanceMetric64 dist_metric + cdef DistanceMetric{{name_suffix}} dist_metric cdef int euclidean # variables to keep track of building & querying stats @@ -795,7 +823,7 @@ cdef class BinaryTree: cdef int n_splits cdef int n_calls - valid_metrics = VALID_METRIC_IDS + valid_metrics = VALID_METRIC_IDS{{name_suffix}} # Use cinit to initialize all arrays to empty: this will prevent memory # errors and seg-faults in rare cases where __init__ is not called @@ -803,11 +831,11 @@ cdef class BinaryTree: # any problem due to potential access to this attribute # (e.g. assigning to NULL or a to value in another segment). 
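For orientation while reading the templated constructor below: the public tree API is unchanged, i.e. a `BallTree` is still built from the data plus an optional `leaf_size`, `sample_weight`, and metric keyword arguments, then queried. A brief usage reminder:

    import numpy as np
    from sklearn.neighbors import BallTree

    rng = np.random.RandomState(0)
    X = rng.random_sample((100, 3))
    weights = rng.random_sample(100)

    # Build the tree; extra keyword arguments (here p=2) go to the metric.
    tree = BallTree(X, leaf_size=20, metric="minkowski", p=2, sample_weight=weights)
    distances, indices = tree.query(X[:5], k=3)          # k-nearest neighbors
    counts = tree.query_radius(X[:5], r=0.5, count_only=True)  # radius counts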
def __cinit__(self): - self.data = np.empty((1, 1), dtype=np.float64, order='C') - self.sample_weight = np.empty(1, dtype=np.float64, order='C') + self.data = np.empty((1, 1), dtype={{INPUT_DTYPE}}, order='C') + self.sample_weight = np.empty(1, dtype={{INPUT_DTYPE}}, order='C') self.idx_array = np.empty(1, dtype=np.intp, order='C') self.node_data = np.empty(1, dtype=NodeData, order='C') - self.node_bounds = np.empty((1, 1, 1), dtype=np.float64) + self.node_bounds = np.empty((1, 1, 1), dtype={{INPUT_DTYPE}}) self.leaf_size = 0 self.n_levels = 0 @@ -823,7 +851,7 @@ cdef class BinaryTree: def __init__(self, data, leaf_size=40, metric='minkowski', sample_weight=None, **kwargs): # validate data - self.data = check_array(data, dtype=np.float64, order='C') + self.data = check_array(data, dtype={{INPUT_DTYPE}}, order='C') if self.data.size == 0: raise ValueError("X is an empty array") @@ -834,15 +862,15 @@ cdef class BinaryTree: raise ValueError("leaf_size must be greater than or equal to 1") self.leaf_size = leaf_size - self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) + self.dist_metric = DistanceMetric.get_metric(metric, dtype={{INPUT_DTYPE}}, **kwargs) self.euclidean = (self.dist_metric.__class__.__name__ - == 'EuclideanDistance64') + == 'EuclideanDistance{{name_suffix}}') metric = self.dist_metric.__class__.__name__ - if metric not in VALID_METRICS: + if metric not in VALID_METRICS{{name_suffix}}: raise ValueError('metric {metric} is not valid for ' '{BinaryTree}'.format(metric=metric, - **DOC_DICT)) + **DOC_DICT{{name_suffix}})) self.dist_metric._validate_data(self.data) # determine number of levels in the tree, and from this @@ -859,7 +887,7 @@ cdef class BinaryTree: self._update_sample_weight(n_samples, sample_weight) # Allocate tree-specific data - allocate_data(self, self.n_nodes, n_features) + allocate_data{{name_suffix}}(self, self.n_nodes, n_features) self._recursive_build( node_data=self.node_data.base, i_node=0, @@ -870,7 +898,7 @@ cdef class BinaryTree: def _update_sample_weight(self, n_samples, sample_weight): if sample_weight is not None: self.sample_weight = np.asarray( - sample_weight, dtype=np.float64, order='C') + sample_weight, dtype={{INPUT_DTYPE}}, order='C') self.sum_weight = np.sum(self.sample_weight) else: self.sample_weight = None @@ -982,17 +1010,17 @@ cdef class BinaryTree: self.node_bounds.base, ) - cdef inline float64_t dist(self, float64_t* x1, float64_t* x2, - intp_t size) except -1 nogil: + cdef inline float64_t dist(self, {{INPUT_DTYPE_t}}* x1, {{INPUT_DTYPE_t}}* x2, + intp_t size) except -1 nogil: """Compute the distance between arrays x1 and x2""" self.n_calls += 1 if self.euclidean: - return euclidean_dist64(x1, x2, size) + return euclidean_dist{{name_suffix}}(x1, x2, size) else: return self.dist_metric.dist(x1, x2, size) - cdef inline float64_t rdist(self, float64_t* x1, float64_t* x2, - intp_t size) except -1 nogil: + cdef inline float64_t rdist(self, {{INPUT_DTYPE_t}}* x1, {{INPUT_DTYPE_t}}* x2, + intp_t size) except -1 nogil: """Compute the reduced distance between arrays x1 and x2. 
The reduced distance, defined for some metrics, is a quantity which @@ -1002,7 +1030,7 @@ cdef class BinaryTree: """ self.n_calls += 1 if self.euclidean: - return euclidean_rdist64(x1, x2, size) + return euclidean_rdist{{name_suffix}}(x1, x2, size) else: return self.dist_metric.rdist(x1, x2, size) @@ -1023,10 +1051,10 @@ cdef class BinaryTree: cdef intp_t n_points = idx_end - idx_start cdef intp_t n_mid = n_points / 2 cdef intp_t* idx_array = &self.idx_array[idx_start] - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # initialize node data - init_node(self, node_data, i_node, idx_start, idx_end) + init_node{{name_suffix}}(self, node_data, i_node, idx_start, idx_end) if 2 * i_node + 1 >= self.n_nodes: node_data[i_node].is_leaf = True @@ -1103,7 +1131,7 @@ cdef class BinaryTree: corresponding point. """ # XXX: we should allow X to be a pre-built tree. - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " @@ -1115,13 +1143,13 @@ cdef class BinaryTree: # flatten X, and save original shape information np_Xarr = X.reshape((-1, self.data.shape[1])) - cdef const float64_t[:, ::1] Xarr = np_Xarr + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr cdef float64_t reduced_dist_LB cdef intp_t i - cdef float64_t* pt + cdef {{INPUT_DTYPE_t}}* pt # initialize heap for neighbors - cdef NeighborsHeap heap = NeighborsHeap(Xarr.shape[0], k) + cdef NeighborsHeap{{name_suffix}} heap = NeighborsHeap{{name_suffix}}(Xarr.shape[0], k) # node heap for breadth-first queries cdef NodeHeap nodeheap @@ -1141,7 +1169,7 @@ cdef class BinaryTree: if breadth_first: self._query_dual_breadthfirst(other, heap, nodeheap) else: - reduced_dist_LB = min_rdist_dual(self, 0, other, 0) + reduced_dist_LB = min_rdist_dual{{name_suffix}}(self, 0, other, 0) bounds = np.full(other.node_data.shape[0], np.inf) self._query_dual_depthfirst(0, other, 0, bounds, heap, reduced_dist_LB) @@ -1155,7 +1183,7 @@ cdef class BinaryTree: else: with nogil: for i in range(Xarr.shape[0]): - reduced_dist_LB = min_rdist(self, 0, pt) + reduced_dist_LB = min_rdist{{name_suffix}}(self, 0, pt) self._query_single_depthfirst(0, pt, i, heap, reduced_dist_LB) pt += Xarr.shape[1] @@ -1233,20 +1261,20 @@ cdef class BinaryTree: cdef intp_t i, count_i = 0 cdef intp_t n_features = self.data.shape[1] - cdef float64_t[::1] dist_arr_i + cdef {{INPUT_DTYPE_t}}[::1] dist_arr_i cdef intp_t[::1] idx_arr_i, counts - cdef float64_t* pt + cdef {{INPUT_DTYPE_t}}* pt cdef intp_t** indices = NULL - cdef float64_t** distances = NULL + cdef {{INPUT_DTYPE_t}}** distances = NULL # validate X and prepare for query - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " "match training data dimension") - cdef const float64_t[:, ::1] Xarr = X.reshape((-1, self.data.shape[1])) + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = X.reshape((-1, self.data.shape[1])) # prepare r for query r = np.asarray(r, dtype=np.float64, order='C') @@ -1265,7 +1293,7 @@ cdef class BinaryTree: if indices == NULL: raise MemoryError() if return_distance: - distances = calloc(Xarr.shape[0], sizeof(float64_t*)) + distances = <{{INPUT_DTYPE_t}}**>calloc(Xarr.shape[0], sizeof({{INPUT_DTYPE_t}}*)) if distances == NULL: free(indices) raise MemoryError() @@ -1273,7 +1301,7 @@ cdef class BinaryTree: 
np_idx_arr = np.zeros(self.data.shape[0], dtype=np.intp) idx_arr_i = np_idx_arr - np_dist_arr = np.zeros(self.data.shape[0], dtype=np.float64) + np_dist_arr = np.zeros(self.data.shape[0], dtype={{INPUT_DTYPE}}) dist_arr_i = np_dist_arr counts_arr = np.zeros(Xarr.shape[0], dtype=np.intp) @@ -1306,11 +1334,11 @@ cdef class BinaryTree: if return_distance: # equivalent to: distances[i] = np_dist_arr[:counts[i]].copy() - distances[i] = malloc(counts[i] * sizeof(float64_t)) + distances[i] = <{{INPUT_DTYPE_t}}*>malloc(counts[i] * sizeof({{INPUT_DTYPE_t}})) if distances[i] == NULL: memory_error = True break - memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof(float64_t)) + memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof({{INPUT_DTYPE_t}})) try: if memory_error: @@ -1333,7 +1361,7 @@ cdef class BinaryTree: # make a new numpy array that wraps the existing data # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 - distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], cnp.NPY_DOUBLE, distances[i]) + distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], {{NPY_TYPE}}, distances[i]) # make sure the data will be freed when the numpy array is garbage collected PyArray_ENABLEFLAGS(distances_npy[i], cnp.NPY_ARRAY_OWNDATA) # make sure the data is not freed twice @@ -1445,18 +1473,18 @@ cdef class BinaryTree: cdef float64_t log_knorm = _log_kernel_norm(h_c, n_features, kernel_c) # validate X and prepare for query - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != n_features: raise ValueError("query data dimension must " "match training data dimension") Xarr_np = X.reshape((-1, n_features)) - cdef float64_t[:, ::1] Xarr = Xarr_np + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = Xarr_np - log_density_arr = np.zeros(Xarr.shape[0], dtype=np.float64) - cdef float64_t[::1] log_density = log_density_arr + log_density_arr = np.zeros(Xarr.shape[0], dtype={{INPUT_DTYPE}}) + cdef {{INPUT_DTYPE_t}}[::1] log_density = log_density_arr - cdef float64_t* pt = &Xarr[0, 0] + cdef {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] cdef NodeHeap nodeheap if breadth_first: @@ -1481,7 +1509,7 @@ cdef class BinaryTree: pt += n_features else: for i in range(Xarr.shape[0]): - min_max_dist(self, 0, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, 0, pt, &dist_LB, &dist_UB) # compute max & min bounds on density within top node log_min_bound = (log(self.sum_weight) + compute_log_kernel(dist_UB, @@ -1539,14 +1567,14 @@ cdef class BinaryTree: cdef intp_t i # validate X and prepare for query - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " "match training data dimension") np_Xarr = X.reshape((-1, self.data.shape[1])) - cdef float64_t[:, ::1] Xarr = np_Xarr + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr # prepare r for query r = np.asarray(r, dtype=np.float64, order='C') @@ -1561,7 +1589,7 @@ cdef class BinaryTree: count = np.zeros(r.shape[0], dtype=np.intp) cdef intp_t[::1] carr = count - cdef float64_t* pt = &Xarr[0, 0] + cdef {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] if dualtree: other = self.__class__(Xarr, metric=self.dist_metric, @@ -1576,17 +1604,21 @@ cdef class BinaryTree: return count - cdef int _query_single_depthfirst(self, intp_t i_node, - float64_t* pt, intp_t i_pt, - NeighborsHeap heap, - float64_t reduced_dist_LB) except -1 nogil: + cdef int 
_query_single_depthfirst( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1 nogil: """Recursive Single-tree k-neighbors query, depth-first approach""" cdef NodeData_t node_info = self.node_data[i_node] cdef float64_t dist_pt, reduced_dist_LB_1, reduced_dist_LB_2 cdef intp_t i, i1, i2 - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # ------------------------------------------------------------ # Case 1: query point is outside node radius: @@ -1611,8 +1643,8 @@ cdef class BinaryTree: self.n_splits += 1 i1 = 2 * i_node + 1 i2 = i1 + 1 - reduced_dist_LB_1 = min_rdist(self, i1, pt) - reduced_dist_LB_2 = min_rdist(self, i2, pt) + reduced_dist_LB_1 = min_rdist{{name_suffix}}(self, i1, pt) + reduced_dist_LB_2 = min_rdist{{name_suffix}}(self, i2, pt) # recursively query subnodes if reduced_dist_LB_1 <= reduced_dist_LB_2: @@ -1627,19 +1659,22 @@ cdef class BinaryTree: reduced_dist_LB_1) return 0 - cdef int _query_single_breadthfirst(self, float64_t* pt, - intp_t i_pt, - NeighborsHeap heap, - NodeHeap nodeheap) except -1: + cdef int _query_single_breadthfirst( + self, + {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: """Non-recursive single-tree k-neighbors query, breadth-first search""" cdef intp_t i, i_node cdef float64_t dist_pt, reduced_dist_LB cdef NodeData_t* node_data = &self.node_data[0] - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # Set up the node heap and push the head node onto it cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_rdist(self, 0, pt) + nodeheap_item.val = min_rdist{{name_suffix}}(self, 0, pt) nodeheap_item.i1 = 0 nodeheap.push(nodeheap_item) @@ -1672,15 +1707,19 @@ cdef class BinaryTree: self.n_splits += 1 for i in range(2 * i_node + 1, 2 * i_node + 3): nodeheap_item.i1 = i - nodeheap_item.val = min_rdist(self, i, pt) + nodeheap_item.val = min_rdist{{name_suffix}}(self, i, pt) nodeheap.push(nodeheap_item) return 0 - cdef int _query_dual_depthfirst(self, intp_t i_node1, - BinaryTree other, intp_t i_node2, - float64_t[::1] bounds, - NeighborsHeap heap, - float64_t reduced_dist_LB) except -1: + cdef int _query_dual_depthfirst( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t[::1] bounds, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1: """Recursive dual-tree k-neighbors query, depth-first""" # note that the array `bounds` is maintained such that # bounds[i] is the largest distance among any of the @@ -1688,8 +1727,8 @@ cdef class BinaryTree: cdef NodeData_t node_info1 = self.node_data[i_node1] cdef NodeData_t node_info2 = other.node_data[i_node2] - cdef float64_t* data1 = &self.data[0, 0] - cdef float64_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef intp_t n_features = self.data.shape[1] cdef float64_t bound_max, dist_pt, reduced_dist_LB1, reduced_dist_LB2 @@ -1740,9 +1779,9 @@ cdef class BinaryTree: # recursively query, starting with the nearest subnode elif node_info1.is_leaf or (not node_info2.is_leaf and node_info2.radius > node_info1.radius): - reduced_dist_LB1 = min_rdist_dual(self, i_node1, + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, i_node1, other, 2 * i_node2 + 1) - reduced_dist_LB2 = min_rdist_dual(self, i_node1, + reduced_dist_LB2 = 
min_rdist_dual{{name_suffix}}(self, i_node1, other, 2 * i_node2 + 2) if reduced_dist_LB1 < reduced_dist_LB2: @@ -1760,9 +1799,9 @@ cdef class BinaryTree: # Case 3b: node 2 is a leaf or is smaller: split node 1 and # recursively query, starting with the nearest subnode else: - reduced_dist_LB1 = min_rdist_dual(self, 2 * i_node1 + 1, + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 1, other, i_node2) - reduced_dist_LB2 = min_rdist_dual(self, 2 * i_node1 + 2, + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 2, other, i_node2) if reduced_dist_LB1 < reduced_dist_LB2: @@ -1777,9 +1816,12 @@ cdef class BinaryTree: bounds, heap, reduced_dist_LB1) return 0 - cdef int _query_dual_breadthfirst(self, BinaryTree other, - NeighborsHeap heap, - NodeHeap nodeheap) except -1: + cdef int _query_dual_breadthfirst( + self, + BinaryTree{{name_suffix}} other, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: """Non-recursive dual-tree k-neighbors query, breadth-first""" cdef intp_t i, i1, i2, i_node1, i_node2, i_pt cdef float64_t dist_pt, reduced_dist_LB @@ -1787,13 +1829,13 @@ cdef class BinaryTree: cdef NodeData_t* node_data1 = &self.node_data[0] cdef NodeData_t* node_data2 = &other.node_data[0] cdef NodeData_t node_info1, node_info2 - cdef float64_t* data1 = &self.data[0, 0] - cdef float64_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef intp_t n_features = self.data.shape[1] # Set up the node heap and push the head nodes onto it cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_rdist_dual(self, 0, other, 0) + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, 0, other, 0) nodeheap_item.i1 = 0 nodeheap_item.i2 = 0 nodeheap.push(nodeheap_item) @@ -1845,7 +1887,7 @@ cdef class BinaryTree: nodeheap_item.i1 = i_node1 for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): nodeheap_item.i2 = i2 - nodeheap_item.val = min_rdist_dual(self, i_node1, + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i_node1, other, i2) nodeheap.push(nodeheap_item) @@ -1856,21 +1898,24 @@ cdef class BinaryTree: nodeheap_item.i2 = i_node2 for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): nodeheap_item.i1 = i1 - nodeheap_item.val = min_rdist_dual(self, i1, + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i1, other, i_node2) nodeheap.push(nodeheap_item) return 0 - cdef intp_t _query_radius_single(self, - intp_t i_node, - float64_t* pt, float64_t r, - intp_t* indices, - float64_t* distances, - intp_t count, - int count_only, - int return_distance) noexcept nogil: + cdef intp_t _query_radius_single( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t r, + intp_t* indices, + {{INPUT_DTYPE_t}}* distances, + intp_t count, + int count_only, + int return_distance, + ) noexcept nogil: """recursive single-tree radius query, depth-first""" - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef intp_t* idx_array = &self.idx_array[0] cdef intp_t n_features = self.data.shape[1] cdef NodeData_t node_info = self.node_data[i_node] @@ -1879,7 +1924,7 @@ cdef class BinaryTree: cdef float64_t reduced_r cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 - min_max_dist(self, i_node, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) # ------------------------------------------------------------ # Case 1: all node points are outside distance r. 
@@ -1937,13 +1982,17 @@ cdef class BinaryTree: return count - cdef float64_t _kde_single_breadthfirst(self, float64_t* pt, - KernelType kernel, float64_t h, - float64_t log_knorm, - float64_t log_atol, float64_t log_rtol, - NodeHeap nodeheap, - float64_t* node_log_min_bounds, - float64_t* node_log_bound_spreads): + cdef float64_t _kde_single_breadthfirst( + self, {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + NodeHeap nodeheap, + float64_t* node_log_min_bounds, + float64_t* node_log_bound_spreads, + ): """non-recursive single-tree kernel density estimation""" # For the given point, node_log_min_bounds and node_log_bound_spreads # will encode the current bounds on the density between the point @@ -1957,9 +2006,9 @@ cdef class BinaryTree: cdef float64_t global_log_min_bound, global_log_bound_spread cdef float64_t global_log_max_bound - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef bint with_sample_weight = self.sample_weight is not None - cdef float64_t* sample_weight + cdef {{INPUT_DTYPE_t}}* sample_weight if with_sample_weight: sample_weight = &self.sample_weight[0] cdef intp_t* idx_array = &self.idx_array[0] @@ -1981,13 +2030,13 @@ cdef class BinaryTree: # push the top node to the heap cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_dist(self, 0, pt) + nodeheap_item.val = min_dist{{name_suffix}}(self, 0, pt) nodeheap_item.i1 = 0 nodeheap.push(nodeheap_item) - global_log_min_bound = log(N) + compute_log_kernel(max_dist(self, - 0, pt), - h, kernel) + global_log_min_bound = log(N) + compute_log_kernel( + max_dist{{name_suffix}}(self, 0, pt), h, kernel + ) global_log_max_bound = log(N) + compute_log_kernel(nodeheap_item.val, h, kernel) global_log_bound_spread = logsubexp(global_log_max_bound, @@ -2056,8 +2105,8 @@ cdef class BinaryTree: N1 = node_data[i1].idx_end - node_data[i1].idx_start N2 = node_data[i2].idx_end - node_data[i2].idx_start - min_max_dist(self, i1, pt, &dist_LB_1, &dist_UB_1) - min_max_dist(self, i2, pt, &dist_LB_2, &dist_UB_2) + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB_1, &dist_UB_1) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB_2, &dist_UB_2) node_log_min_bounds[i1] = (log(N1) + compute_log_kernel(dist_UB_1, @@ -2102,14 +2151,19 @@ cdef class BinaryTree: global_log_bound_spread - log(2)) cdef int _kde_single_depthfirst( - self, intp_t i_node, float64_t* pt, - KernelType kernel, float64_t h, - float64_t log_knorm, - float64_t log_atol, float64_t log_rtol, - float64_t local_log_min_bound, - float64_t local_log_bound_spread, - float64_t* global_log_min_bound, - float64_t* global_log_bound_spread) except -1: + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + float64_t local_log_min_bound, + float64_t local_log_bound_spread, + float64_t* global_log_min_bound, + float64_t* global_log_bound_spread, + ) except -1: """recursive single-tree kernel density estimate, depth-first""" # For the given point, local_min_bound and local_max_bound give the # minimum and maximum density for the current node, while @@ -2119,10 +2173,10 @@ cdef class BinaryTree: cdef intp_t i, i1, i2, iw, start, end cdef float64_t N1, N2 - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef NodeData_t* node_data = &self.node_data[0] cdef bint with_sample_weight = self.sample_weight is not None - cdef float64_t* 
sample_weight + cdef {{INPUT_DTYPE_t}}* sample_weight cdef float64_t log_weight if with_sample_weight: sample_weight = &self.sample_weight[0] @@ -2194,7 +2248,7 @@ cdef class BinaryTree: N1 = (self.node_data[i1].idx_end - self.node_data[i1].idx_start) N2 = (self.node_data[i2].idx_end - self.node_data[i2].idx_start) - min_max_dist(self, i1, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB, &dist_UB) child1_log_min_bound = log(N1) + compute_log_kernel(dist_UB, h, kernel) child1_log_bound_spread = logsubexp(log(N1) + @@ -2202,7 +2256,7 @@ cdef class BinaryTree: kernel), child1_log_min_bound) - min_max_dist(self, i2, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB, &dist_UB) child2_log_min_bound = log(N2) + compute_log_kernel(dist_UB, h, kernel) child2_log_bound_spread = logsubexp(log(N2) + @@ -2238,11 +2292,17 @@ cdef class BinaryTree: global_log_bound_spread) return 0 - cdef int _two_point_single(self, intp_t i_node, float64_t* pt, float64_t* r, - intp_t* count, intp_t i_min, - intp_t i_max) except -1: + cdef int _two_point_single( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: """recursive single-tree two-point correlation function query""" - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef intp_t* idx_array = &self.idx_array[0] cdef intp_t n_features = self.data.shape[1] cdef NodeData_t node_info = self.node_data[i_node] @@ -2251,7 +2311,7 @@ cdef class BinaryTree: cdef float64_t reduced_r cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 - min_max_dist(self, i_node, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) # ------------------------------------------------------------ # Go through bounds and check for cuts @@ -2287,13 +2347,19 @@ cdef class BinaryTree: count, i_min, i_max) return 0 - cdef int _two_point_dual(self, intp_t i_node1, - BinaryTree other, intp_t i_node2, - float64_t* r, intp_t* count, - intp_t i_min, intp_t i_max) except -1: + cdef int _two_point_dual( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: """recursive dual-tree two-point correlation function query""" - cdef float64_t* data1 = &self.data[0, 0] - cdef float64_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef intp_t* idx_array1 = &self.idx_array[0] cdef intp_t* idx_array2 = &other.idx_array[0] cdef NodeData_t node_info1 = self.node_data[i_node1] @@ -2305,8 +2371,8 @@ cdef class BinaryTree: cdef float64_t reduced_r cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 - dist_LB = min_dist_dual(self, i_node1, other, i_node2) - dist_UB = max_dist_dual(self, i_node1, other, i_node2) + dist_LB = min_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) + dist_UB = max_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) # ------------------------------------------------------------ # Go through bounds and check for cuts @@ -2359,21 +2425,11 @@ cdef class BinaryTree: r, count, i_min, i_max) return 0 +{{endfor}} ###################################################################### # Python functions for benchmarking and testing C implementations -def load_heap(float64_t[:, ::1] X, intp_t k): - """test fully loading the heap""" - assert k <= X.shape[1] - cdef NeighborsHeap heap = 
NeighborsHeap(X.shape[0], k) - cdef intp_t i, j - for i in range(X.shape[0]): - for j in range(X.shape[1]): - heap._push(i, X[i, j], j) - return heap.get_arrays() - - def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): """In-place simultaneous sort the given row of the arrays @@ -2412,10 +2468,12 @@ def nodeheap_sort(float64_t[::1] vals): return np.asarray(vals_sorted), np.asarray(indices) -cdef inline float64_t _total_node_weight(NodeData_t* node_data, - float64_t* sample_weight, - intp_t* idx_array, - intp_t i_node): +cdef inline float64_t _total_node_weight( + NodeData_t* node_data, + const floating* sample_weight, + intp_t* idx_array, + intp_t i_node, +): cdef intp_t i cdef float64_t N = 0.0 for i in range(node_data[i_node].idx_start, node_data[i_node].idx_end): diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index a9b78d6e499c9..0f5bd1439f81c 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -329,8 +329,8 @@ def predict_proba(self, X): self._fit_X, k=self.n_neighbors, weights=self.weights, - labels=self._y, - unique_labels=self.classes_, + Y_labels=self._y, + unique_Y_labels=self.classes_, metric=metric, metric_kwargs=metric_kwargs, # `strategy="parallel_on_X"` has in practice be shown diff --git a/sklearn/neighbors/_kd_tree.pyx b/sklearn/neighbors/_kd_tree.pyx.tp similarity index 65% rename from sklearn/neighbors/_kd_tree.pyx rename to sklearn/neighbors/_kd_tree.pyx.tp index f5cd2617be147..1006ec2a8398c 100644 --- a/sklearn/neighbors/_kd_tree.pyx +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -1,22 +1,52 @@ +{{py: + +# Generated file: _kd_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + # By Jake Vanderplas (2013) # written for the scikit-learn project # License: BSD -__all__ = ['KDTree'] +}} + -DOC_DICT = {'BinaryTree': 'KDTree', 'binary_tree': 'kd_tree'} +__all__ = ['KDTree', 'KDTree64', 'KDTree32'] -VALID_METRICS = ['EuclideanDistance64', 'ManhattanDistance64', - 'ChebyshevDistance64', 'MinkowskiDistance64'] +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'KDTree{{name_suffix}}', + 'binary_tree': 'kd_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'EuclideanDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}' +] + +{{endfor}} include "_binary_tree.pxi" -# Inherit KDTree from BinaryTree -cdef class KDTree(BinaryTree): - __doc__ = CLASS_DOC.format(**DOC_DICT) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit KDTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class KDTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) pass +{{endfor}} + # ---------------------------------------------------------------------- # The functions below specialized the Binary Tree as a KD Tree @@ -28,27 +58,36 @@ cdef class KDTree(BinaryTree): # distance for the Euclidean metric is the squared-euclidean distance. # For some metrics, the reduced distance is simply the distance. 
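For the Euclidean metric, the "reduced distance" referred to in the comment above is simply the squared distance; a minimal NumPy sketch (illustrative values only) of how the two quantities relate:

    import numpy as np

    x = np.array([0.0, 3.0])
    y = np.array([4.0, 0.0])

    rdist = np.sum((x - y) ** 2)  # reduced distance: squared Euclidean, 25.0
    dist = np.sqrt(rdist)         # true distance, 5.0

Because the mapping from rdist to dist is monotone, neighbor ranking and bound pruning can be done on rdist alone, and conversions such as the `_rdist_to_dist` calls in the dual-tree helpers below are only needed where a true distance is actually reported.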
+{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} -cdef int allocate_data(BinaryTree tree, intp_t n_nodes, - intp_t n_features) except -1: +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: """Allocate arrays needed for the KD Tree""" - tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype=np.float64) + tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype={{INPUT_DTYPE}}) return 0 -cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, intp_t i_node, - intp_t idx_start, intp_t idx_end) except -1: +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: """Initialize the node for the dataset stored in tree.data""" cdef intp_t n_features = tree.data.shape[1] cdef intp_t i, j cdef float64_t rad = 0 - cdef float64_t* lower_bounds = &tree.node_bounds[0, i_node, 0] - cdef float64_t* upper_bounds = &tree.node_bounds[1, i_node, 0] - cdef float64_t* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* lower_bounds = &tree.node_bounds[0, i_node, 0] + cdef {{INPUT_DTYPE_t}}* upper_bounds = &tree.node_bounds[1, i_node, 0] + cdef {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] cdef intp_t* idx_array = &tree.idx_array[0] - cdef float64_t* data_row + cdef {{INPUT_DTYPE_t}}* data_row # determine Node bounds for j in range(n_features): @@ -81,8 +120,11 @@ cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, intp_t i_node, return 0 -cdef float64_t min_rdist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1 nogil: +cdef float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: """Compute the minimum reduced-distance between a point and a node""" cdef intp_t n_features = tree.data.shape[1] cdef float64_t d, d_lo, d_hi, rdist=0.0 @@ -105,16 +147,26 @@ cdef float64_t min_rdist(BinaryTree tree, intp_t i_node, return rdist -cdef float64_t min_dist(BinaryTree tree, intp_t i_node, float64_t* pt) except -1: +cdef float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the minimum distance between a point and a node""" if tree.dist_metric.p == INF: - return min_rdist(tree, i_node, pt) + return min_rdist{{name_suffix}}(tree, i_node, pt) else: - return pow(min_rdist(tree, i_node, pt), 1. / tree.dist_metric.p) + return pow( + min_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) -cdef float64_t max_rdist(BinaryTree tree, - intp_t i_node, float64_t* pt) except -1: +cdef float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the maximum reduced-distance between a point and a node""" cdef intp_t n_features = tree.data.shape[1] @@ -134,16 +186,28 @@ cdef float64_t max_rdist(BinaryTree tree, return rdist -cdef float64_t max_dist(BinaryTree tree, intp_t i_node, float64_t* pt) except -1: +cdef float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the maximum distance between a point and a node""" if tree.dist_metric.p == INF: - return max_rdist(tree, i_node, pt) + return max_rdist{{name_suffix}}(tree, i_node, pt) else: - return pow(max_rdist(tree, i_node, pt), 1. 
/ tree.dist_metric.p) - - -cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, - float64_t* min_dist, float64_t* max_dist) except -1 nogil: + return pow( + max_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 nogil: """Compute the minimum and maximum distance between a point and a node""" cdef intp_t n_features = tree.data.shape[1] @@ -177,8 +241,12 @@ cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, return 0 -cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the minimum reduced distance between two nodes""" cdef intp_t n_features = tree1.data.shape[1] @@ -208,15 +276,24 @@ cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, return rdist -cdef inline float64_t min_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the minimum distance between two nodes""" - return tree1.dist_metric._rdist_to_dist(min_rdist_dual(tree1, i_node1, - tree2, i_node2)) + return tree1.dist_metric._rdist_to_dist( + min_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) -cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the maximum reduced distance between two nodes""" cdef intp_t n_features = tree1.data.shape[1] @@ -240,8 +317,20 @@ cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, return rdist -cdef inline float64_t max_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the maximum distance between two nodes""" - return tree1.dist_metric._rdist_to_dist(max_rdist_dual(tree1, i_node1, - tree2, i_node2)) + return tree1.dist_metric._rdist_to_dist( + max_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class KDTree(KDTree64): + __doc__ = CLASS_DOC.format(BinaryTree="KDTree") + pass diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd index 927fde873ee58..c6a0d4bb975c2 100644 --- a/sklearn/neighbors/_partition_nodes.pxd +++ b/sklearn/neighbors/_partition_nodes.pxd @@ -1,7 +1,8 @@ +from cython cimport floating from ..utils._typedefs cimport float64_t, intp_t cdef int partition_node_indices( - float64_t *data, + floating *data, intp_t *node_indices, intp_t split_dim, intp_t split_index, diff --git a/sklearn/neighbors/_partition_nodes.pyx b/sklearn/neighbors/_partition_nodes.pyx index d293b765ea279..011b024fccb14 100644 --- a/sklearn/neighbors/_partition_nodes.pyx +++ b/sklearn/neighbors/_partition_nodes.pyx @@ 
-16,6 +16,8 @@ # - https://en.cppreference.com/w/cpp/algorithm/nth_element. # - https://github.com/scikit-learn/scikit-learn/pull/11103 # - https://github.com/scikit-learn/scikit-learn/pull/19473 +from cython cimport floating + cdef extern from *: """ @@ -63,7 +65,7 @@ cdef extern from *: cdef int partition_node_indices( - float64_t *data, + floating *data, intp_t *node_indices, intp_t split_dim, intp_t split_index, diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index b9b7f4030d02c..2897c1ce409e8 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -15,6 +15,7 @@ import numpy as np from ..base import RegressorMixin, _fit_context +from ..metrics import DistanceMetric from ..utils._param_validation import StrOptions from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights @@ -71,7 +72,7 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : str or callable, default='minkowski' + metric : str, DistanceMetric object or callable, default='minkowski' Metric to use for distance computation. Default is "minkowski", which results in the standard Euclidean distance when p = 2. See the documentation of `scipy.spatial.distance @@ -89,6 +90,9 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): between those vectors. This works for Scipy's metrics, but is less efficient than passing the metric name as a string. + If metric is a DistanceMetric object, it will be passed directly to + the underlying computation routines. + metric_params : dict, default=None Additional keyword arguments for the metric function. 
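A minimal usage sketch of the behaviour documented above, assuming a scikit-learn build that includes this change; the toy data and parameter values are illustrative only:

    import numpy as np
    from sklearn.metrics import DistanceMetric
    from sklearn.neighbors import KNeighborsRegressor

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0.0, 1.0, 2.0, 3.0])

    # Pass a prebuilt DistanceMetric object instead of a metric name.
    manhattan = DistanceMetric.get_metric("manhattan")
    reg = KNeighborsRegressor(n_neighbors=2, metric=manhattan).fit(X, y)
    print(reg.predict([[1.5]]))  # mean of the two nearest targets -> [1.5]

As the added docstring notes, the object is passed directly to the underlying computation routines, so the same pre-parameterised metric can be reused across estimators.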
@@ -164,6 +168,7 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): **NeighborsBase._parameter_constraints, "weights": [StrOptions({"uniform", "distance"}), callable, None], } + _parameter_constraints["metric"].append(DistanceMetric) _parameter_constraints.pop("radius") def __init__( diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index efca4e491ce01..5263f201f320b 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -2,9 +2,9 @@ import numpy as np import pytest -from numpy.testing import assert_array_almost_equal +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal -from sklearn.neighbors._ball_tree import BallTree +from sklearn.neighbors._ball_tree import BallTree, BallTree32, BallTree64 from sklearn.utils import check_random_state from sklearn.utils._testing import _convert_container from sklearn.utils.validation import check_array @@ -15,6 +15,13 @@ DIMENSION = 3 +METRICS = { + "euclidean": {}, + "manhattan": {}, + "minkowski": dict(p=3), + "chebyshev": {}, +} + DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"] BOOLEAN_METRICS = [ @@ -26,6 +33,11 @@ "sokalsneath", ] +BALL_TREE_CLASSES = [ + BallTree64, + BallTree32, +] + def brute_force_neighbors(X, Y, k, metric, **kwargs): from sklearn.metrics import DistanceMetric @@ -37,9 +49,14 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): return dist, ind +def test_BallTree_is_BallTree64_subclass(): + assert issubclass(BallTree, BallTree64) + + @pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) @pytest.mark.parametrize("array_type", ["list", "array"]) -def test_ball_tree_query_metrics(metric, array_type): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_ball_tree_query_metrics(metric, array_type, BallTreeImplementation): rng = check_random_state(0) if metric in BOOLEAN_METRICS: X = rng.random_sample((40, 10)).round(0) @@ -52,31 +69,36 @@ def test_ball_tree_query_metrics(metric, array_type): k = 5 - bt = BallTree(X, leaf_size=1, metric=metric) + bt = BallTreeImplementation(X, leaf_size=1, metric=metric) dist1, ind1 = bt.query(Y, k) dist2, ind2 = brute_force_neighbors(X, Y, k, metric) assert_array_almost_equal(dist1, dist2) -def test_query_haversine(): +@pytest.mark.parametrize( + "BallTreeImplementation, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5]) +) +def test_query_haversine(BallTreeImplementation, decimal_tol): rng = check_random_state(0) X = 2 * np.pi * rng.random_sample((40, 2)) - bt = BallTree(X, leaf_size=1, metric="haversine") + bt = BallTreeImplementation(X, leaf_size=1, metric="haversine") dist1, ind1 = bt.query(X, k=5) dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine") - assert_array_almost_equal(dist1, dist2) + assert_array_almost_equal(dist1, dist2, decimal=decimal_tol) assert_array_almost_equal(ind1, ind2) -def test_array_object_type(): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_array_object_type(BallTreeImplementation): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - BallTree(X) + BallTreeImplementation(X) -def test_bad_pyfunc_metric(): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_bad_pyfunc_metric(BallTreeImplementation): def wrong_returned_value(x, 
y): return "1" @@ -86,8 +108,93 @@ def one_arg_func(x): X = np.ones((5, 2)) msg = "Custom distance function must accept two vectors and return a float." with pytest.raises(TypeError, match=msg): - BallTree(X, metric=wrong_returned_value) + BallTreeImplementation(X, metric=wrong_returned_value) msg = "takes 1 positional argument but 2 were given" with pytest.raises(TypeError, match=msg): - BallTree(X, metric=one_arg_func) + BallTreeImplementation(X, metric=one_arg_func) + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_ball_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 5 + dist_64, ind_64 = bt_64.query(Y_64, k=k) + dist_32, ind_32 = bt_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = bt_64.query_radius(Y_64, r=r) + ind_32 = bt_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = bt_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = bt_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = bt_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 + + +def test_two_point_correlation_numerical_consistency(global_random_seed): + # Test consistency with respect to the `two_point_correlation` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + bt_64 = BallTree64(X_64, leaf_size=10) + bt_32 = BallTree32(X_32, leaf_size=10) + + r = np.linspace(0, 1, 10) + + counts_64 = bt_64.two_point_correlation(Y_64, r=r, dualtree=True) + counts_32 = bt_32.two_point_correlation(Y_32, r=r, dualtree=True) + assert_allclose(counts_64, counts_32) + + +def get_dataset_for_binary_tree(random_seed, features=3): + rng = np.random.RandomState(random_seed) + _X = rng.rand(100, features) + _Y = rng.rand(5, features) + 
+ X_64 = _X.astype(dtype=np.float64, copy=False) + Y_64 = _Y.astype(dtype=np.float64, copy=False) + + X_32 = _X.astype(dtype=np.float32, copy=False) + Y_32 = _Y.astype(dtype=np.float32, copy=False) + + return X_64, X_32, Y_64, Y_32 diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 1aee28cc36bd0..749601baaf66f 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -1,30 +1,100 @@ import numpy as np import pytest +from numpy.testing import assert_allclose, assert_equal -from sklearn.neighbors._kd_tree import KDTree +from sklearn.neighbors._kd_tree import KDTree, KDTree32, KDTree64 +from sklearn.neighbors.tests.test_ball_tree import get_dataset_for_binary_tree from sklearn.utils.parallel import Parallel, delayed DIMENSION = 3 METRICS = {"euclidean": {}, "manhattan": {}, "chebyshev": {}, "minkowski": dict(p=3)} +KD_TREE_CLASSES = [ + KDTree64, + KDTree32, +] -def test_array_object_type(): + +def test_KDTree_is_KDTree64_subclass(): + assert issubclass(KDTree, KDTree64) + + +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_array_object_type(BinarySearchTree): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - KDTree(X) + BinarySearchTree(X) -def test_kdtree_picklable_with_joblib(): +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_kdtree_picklable_with_joblib(BinarySearchTree): """Make sure that KDTree queries work when joblib memmaps. Non-regression test for #21685 and #21228.""" rng = np.random.RandomState(0) X = rng.random_sample((10, 3)) - tree = KDTree(X, leaf_size=2) + tree = BinarySearchTree(X, leaf_size=2) # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that # use to raise "ValueError: buffer source array is read-only" in a previous # version of the Cython code. Parallel(n_jobs=2, max_nbytes=1)(delayed(tree.query)(data) for data in 2 * [X]) + + +@pytest.mark.parametrize("metric", METRICS) +def test_kd_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. 
+ X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 4 + dist_64, ind_64 = kd_64.query(Y_64, k=k) + dist_32, ind_32 = kd_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = kd_64.query_radius(Y_64, r=r) + ind_32 = kd_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = kd_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = kd_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", METRICS) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = kd_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = kd_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 405ac3a6d0847..35fc210bea7f3 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -24,6 +24,9 @@ ) from sklearn.base import clone from sklearn.exceptions import DataConversionWarning, EfficiencyWarning, NotFittedError +from sklearn.metrics._dist_metrics import ( + DistanceMetric, +) from sklearn.metrics.pairwise import pairwise_distances from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS from sklearn.metrics.tests.test_pairwise_distances_reduction import ( @@ -69,6 +72,7 @@ COMMON_VALID_METRICS = sorted( set.intersection(*map(set, neighbors.VALID_METRICS.values())) ) # type: ignore + P = (1, 2, 3, 4, np.inf) JOBLIB_BACKENDS = list(joblib.parallel.BACKENDS.keys()) @@ -76,6 +80,25 @@ neighbors.kneighbors_graph = ignore_warnings(neighbors.kneighbors_graph) neighbors.radius_neighbors_graph = ignore_warnings(neighbors.radius_neighbors_graph) +# A list containing metrics where the string specifies the use of the +# DistanceMetric object directly (as resolved in _parse_metric) +DISTANCE_METRIC_OBJS = ["DM_euclidean"] + + +def _parse_metric(metric: str, dtype=None): + """ + Helper function for properly building a type-specialized DistanceMetric instances. 
+ + Constructs a type-specialized DistanceMetric instance from a string + beginning with "DM_" while allowing a pass-through for other metric-specifying + strings. This is necessary since we wish to parameterize dtype independent of + metric, yet DistanceMetric requires it for construction. + + """ + if metric[:3] == "DM_": + return DistanceMetric.get_metric(metric[3:], dtype=dtype) + return metric + def _generate_test_params_for(metric: str, n_features: int): """Return list of DistanceMetric kwargs for tests.""" @@ -129,7 +152,7 @@ def _weight_func(dist): ], ) @pytest.mark.parametrize("query_is_train", [False, True]) -@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS + DISTANCE_METRIC_OBJS) # type: ignore # noqa def test_unsupervised_kneighbors( global_dtype, n_samples, @@ -143,6 +166,8 @@ def test_unsupervised_kneighbors( # on their common metrics, with and without returning # distances + metric = _parse_metric(metric, global_dtype) + # Redefining the rng locally to use the same generated X local_rng = np.random.RandomState(0) X = local_rng.rand(n_samples, n_features).astype(global_dtype, copy=False) @@ -157,6 +182,12 @@ def test_unsupervised_kneighbors( results = [] for algorithm in ALGORITHMS: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." + ) neigh = neighbors.NearestNeighbors( n_neighbors=n_neighbors, algorithm=algorithm, metric=metric ) @@ -206,7 +237,7 @@ def test_unsupervised_kneighbors( (1000, 5, 100), ], ) -@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS + DISTANCE_METRIC_OBJS) # type: ignore # noqa @pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) @pytest.mark.parametrize( "NeighborsMixinSubclass", @@ -230,6 +261,19 @@ def test_neigh_predictions_algorithm_agnosticity( # The different algorithms must return identical predictions results # on their common metrics. + metric = _parse_metric(metric, global_dtype) + if isinstance(metric, DistanceMetric): + if "Classifier" in NeighborsMixinSubclass.__name__: + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for" + " classifiers." + ) + if "Radius" in NeighborsMixinSubclass.__name__: + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for" + " radius-neighbor estimators." + ) + # Redefining the rng locally to use the same generated X local_rng = np.random.RandomState(0) X = local_rng.rand(n_samples, n_features).astype(global_dtype, copy=False) @@ -244,6 +288,12 @@ def test_neigh_predictions_algorithm_agnosticity( ) for algorithm in ALGORITHMS: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." 
+ ) neigh = NeighborsMixinSubclass(parameter, algorithm=algorithm, metric=metric) neigh.fit(X, y) @@ -985,15 +1035,26 @@ def test_query_equidistant_kth_nn(algorithm): @pytest.mark.parametrize( ["algorithm", "metric"], - [ - ("ball_tree", "euclidean"), - ("kd_tree", "euclidean"), + list( + product( + ("kd_tree", "ball_tree", "brute"), + ("euclidean", *DISTANCE_METRIC_OBJS), + ) + ) + + [ ("brute", "euclidean"), ("brute", "precomputed"), ], ) def test_radius_neighbors_sort_results(algorithm, metric): # Test radius_neighbors[_graph] output when sort_result is True + + metric = _parse_metric(metric, np.float64) + if isinstance(metric, DistanceMetric): + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for radius-neighbor" + " estimators." + ) n_samples = 10 rng = np.random.RandomState(42) X = rng.random_sample((n_samples, 4)) @@ -1560,11 +1621,14 @@ def test_nearest_neighbors_validate_params(): neighbors.VALID_METRICS["brute"] ) - set(["pyfunc", *BOOL_METRICS]) - ), + ) + + DISTANCE_METRIC_OBJS, ) def test_neighbors_metrics( global_dtype, metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5 ): + metric = _parse_metric(metric, global_dtype) + # Test computing the neighbors for various metrics algorithms = ["brute", "ball_tree", "kd_tree"] X_train = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) @@ -1574,12 +1638,21 @@ def test_neighbors_metrics( for metric_params in metric_params_list: # Some metric (e.g. Weighted minkowski) are not supported by KDTree - exclude_kd_tree = metric not in neighbors.VALID_METRICS["kd_tree"] or ( - "minkowski" in metric and "w" in metric_params + exclude_kd_tree = ( + False + if isinstance(metric, DistanceMetric) + else metric not in neighbors.VALID_METRICS["kd_tree"] + or ("minkowski" in metric and "w" in metric_params) ) results = {} p = metric_params.pop("p", 2) for algorithm in algorithms: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." + ) neigh = neighbors.NearestNeighbors( n_neighbors=n_neighbors, algorithm=algorithm, @@ -1684,10 +1757,14 @@ def custom_metric(x1, x2): assert_allclose(dist1, dist2) -@pytest.mark.parametrize("metric", neighbors.VALID_METRICS["brute"]) +@pytest.mark.parametrize( + "metric", neighbors.VALID_METRICS["brute"] + DISTANCE_METRIC_OBJS +) def test_valid_brute_metric_for_auto_algorithm( global_dtype, metric, n_samples=20, n_features=12 ): + metric = _parse_metric(metric, global_dtype) + X = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) Xcsr = csr_matrix(X) @@ -2207,3 +2284,22 @@ def test_predict_dataframe(): knn = neighbors.KNeighborsClassifier(n_neighbors=2).fit(X, y) knn.predict(X) + + +def test_nearest_neighbours_works_with_p_less_than_1(): + """Check that NearestNeighbors works with :math:`p \\in (0,1)` when `algorithm` + is `"auto"` or `"brute"` regardless of the dtype of X. 
+ + Non-regression test for issue #26548 + """ + X = np.array([[1.0, 0.0], [0.0, 0.0], [0.0, 1.0]]) + neigh = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="brute", metric_params={"p": 0.5} + ) + neigh.fit(X) + + y = neigh.radius_neighbors(X[0].reshape(1, -1), radius=4, return_distance=False) + assert_allclose(y[0], [0, 1, 2]) + + y = neigh.kneighbors(X[0].reshape(1, -1), return_distance=False) + assert_allclose(y[0], [0, 1, 2]) diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index 590e72ab785d2..4d8bac12f7423 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -13,7 +13,7 @@ kernel_norm, ) from sklearn.neighbors._ball_tree import ( - NeighborsHeap as NeighborsHeapBT, + NeighborsHeap64 as NeighborsHeapBT, ) from sklearn.neighbors._ball_tree import ( nodeheap_sort as nodeheap_sort_bt, @@ -25,7 +25,7 @@ KDTree, ) from sklearn.neighbors._kd_tree import ( - NeighborsHeap as NeighborsHeapKDT, + NeighborsHeap64 as NeighborsHeapKDT, ) from sklearn.neighbors._kd_tree import ( nodeheap_sort as nodeheap_sort_kdt, diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index d6ad0001ad257..d85196e879b45 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -334,9 +334,7 @@ def _log_message(self, step_idx): def _check_method_params(self, method, props, **kwargs): if _routing_enabled(): - routed_params = process_routing( - self, method=method, other_params=props, **kwargs - ) + routed_params = process_routing(self, method, **props, **kwargs) return routed_params else: fit_params_steps = Bunch( @@ -586,7 +584,7 @@ def predict(self, X, **params): return self.steps[-1][1].predict(Xt, **params) # metadata routing enabled - routed_params = process_routing(self, "predict", other_params=params) + routed_params = process_routing(self, "predict", **params) for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt, **routed_params[name].transform) return self.steps[-1][1].predict(Xt, **routed_params[self.steps[-1][0]].predict) @@ -706,7 +704,7 @@ def predict_proba(self, X, **params): return self.steps[-1][1].predict_proba(Xt, **params) # metadata routing enabled - routed_params = process_routing(self, "predict_proba", other_params=params) + routed_params = process_routing(self, "predict_proba", **params) for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt, **routed_params[name].transform) return self.steps[-1][1].predict_proba( @@ -747,7 +745,7 @@ def decision_function(self, X, **params): # not branching here since params is only available if # enable_metadata_routing=True - routed_params = process_routing(self, "decision_function", other_params=params) + routed_params = process_routing(self, "decision_function", **params) Xt = X for _, name, transform in self._iter(with_final=False): @@ -833,7 +831,7 @@ def predict_log_proba(self, X, **params): return self.steps[-1][1].predict_log_proba(Xt, **params) # metadata routing enabled - routed_params = process_routing(self, "predict_log_proba", other_params=params) + routed_params = process_routing(self, "predict_log_proba", **params) for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt, **routed_params[name].transform) return self.steps[-1][1].predict_log_proba( @@ -882,7 +880,7 @@ def transform(self, X, **params): # not branching here since params is only available if # enable_metadata_routing=True - routed_params = process_routing(self, 
"transform", other_params=params) + routed_params = process_routing(self, "transform", **params) Xt = X for _, name, transform in self._iter(): Xt = transform.transform(Xt, **routed_params[name].transform) @@ -925,7 +923,7 @@ def inverse_transform(self, Xt, **params): # we don't have to branch here, since params is only non-empty if # enable_metadata_routing=True. - routed_params = process_routing(self, "inverse_transform", other_params=params) + routed_params = process_routing(self, "inverse_transform", **params) reverse_iter = reversed(list(self._iter())) for _, name, transform in reverse_iter: Xt = transform.inverse_transform( @@ -981,7 +979,7 @@ def score(self, X, y=None, sample_weight=None, **params): # metadata routing is enabled. routed_params = process_routing( - self, "score", sample_weight=sample_weight, other_params=params + self, "score", sample_weight=sample_weight, **params ) Xt = X @@ -1108,7 +1106,7 @@ def get_metadata_routing(self): router = MetadataRouter(owner=self.__class__.__name__) # first we add all steps except the last one - for _, name, trans in self._iter(with_final=False): + for _, name, trans in self._iter(with_final=False, filter_passthrough=True): method_mapping = MethodMapping() # fit, fit_predict, and fit_transform call fit_transform if it # exists, or else fit and transform @@ -1142,7 +1140,7 @@ def get_metadata_routing(self): router.add(method_mapping=method_mapping, **{name: trans}) final_name, final_est = self.steps[-1] - if not final_est: + if final_est is None or final_est == "passthrough": return router # then we add the last step diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 0a0447de95cd8..2c4ea4af450f2 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -14,6 +14,7 @@ from ..utils._encode import _check_unknown, _encode, _get_counts, _unique from ..utils._mask import _get_mask from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from ..utils._set_output import _get_output_config from ..utils.validation import _check_feature_names_in, check_is_fitted __all__ = ["OneHotEncoder", "OrdinalEncoder"] @@ -176,11 +177,11 @@ def _transform( warn_on_unknown=False, ignore_category_indices=None, ): - self._check_feature_names(X, reset=False) - self._check_n_features(X, reset=False) X_list, n_samples, n_features = self._check_X( X, force_all_finite=force_all_finite ) + self._check_feature_names(X, reset=False) + self._check_n_features(X, reset=False) X_int = np.zeros((n_samples, n_features), dtype=int) X_mask = np.ones((n_samples, n_features), dtype=bool) @@ -437,7 +438,7 @@ def _map_infrequent_categories(self, X_int, X_mask, ignore_category_indices): X_int[rows_to_update, i] = np.take(mapping, X_int[rows_to_update, i]) def _more_tags(self): - return {"X_types": ["categorical"]} + return {"X_types": ["2darray", "categorical"], "allow_nan": True} class OneHotEncoder(_BaseEncoder): @@ -1008,6 +1009,14 @@ def transform(self, X): returned. """ check_is_fitted(self) + transform_output = _get_output_config("transform", estimator=self)["dense"] + if transform_output == "pandas" and self.sparse_output: + raise ValueError( + "Pandas output does not support sparse data. Set sparse_output=False to" + " output pandas DataFrames or disable pandas output via" + ' `ohe.set_output(transform="default").' 
+ ) + # validation of X happens in _check_X called by _transform warn_on_unknown = self.drop is not None and self.handle_unknown in { "ignore", @@ -1499,15 +1508,11 @@ def fit(self, X, y=None): if infrequent is not None: cardinalities[feature_idx] -= len(infrequent) - # stores the missing indices per category - self._missing_indices = {} + # missing values are not considered part of the cardinality + # when considering unknown categories or encoded_missing_value for cat_idx, categories_for_idx in enumerate(self.categories_): - for i, cat in enumerate(categories_for_idx): + for cat in categories_for_idx: if is_scalar_nan(cat): - self._missing_indices[cat_idx] = i - - # missing values are not considered part of the cardinality - # when considering unknown categories or encoded_missing_value cardinalities[cat_idx] -= 1 continue diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index d621c6c410153..3008710d3c3dc 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -31,7 +31,7 @@ ] -class LabelEncoder(TransformerMixin, BaseEstimator): +class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): """Encode target labels with value between 0 and n_classes-1. This transformer should be used to encode target values, *i.e.* `y`, and @@ -56,8 +56,8 @@ class LabelEncoder(TransformerMixin, BaseEstimator): -------- `LabelEncoder` can be used to normalize labels. - >>> from sklearn import preprocessing - >>> le = preprocessing.LabelEncoder() + >>> from sklearn.preprocessing import LabelEncoder + >>> le = LabelEncoder() >>> le.fit([1, 2, 2, 6]) LabelEncoder() >>> le.classes_ @@ -70,7 +70,7 @@ class LabelEncoder(TransformerMixin, BaseEstimator): It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels. - >>> le = preprocessing.LabelEncoder() + >>> le = LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) LabelEncoder() >>> list(le.classes_) @@ -165,7 +165,7 @@ def _more_tags(self): return {"X_types": ["1dlabels"]} -class LabelBinarizer(TransformerMixin, BaseEstimator): +class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): """Binarize labels in a one-vs-all fashion. Several regression and binary classification algorithms are @@ -221,8 +221,8 @@ class LabelBinarizer(TransformerMixin, BaseEstimator): Examples -------- - >>> from sklearn import preprocessing - >>> lb = preprocessing.LabelBinarizer() + >>> from sklearn.preprocessing import LabelBinarizer + >>> lb = LabelBinarizer() >>> lb.fit([1, 2, 6, 4, 2]) LabelBinarizer() >>> lb.classes_ @@ -233,7 +233,7 @@ class LabelBinarizer(TransformerMixin, BaseEstimator): Binary targets transform to a column vector - >>> lb = preprocessing.LabelBinarizer() + >>> lb = LabelBinarizer() >>> lb.fit_transform(['yes', 'no', 'no', 'yes']) array([[1], [0], @@ -685,7 +685,7 @@ def _inverse_binarize_thresholding(y, output_type, classes, threshold): raise ValueError("{0} format is not supported".format(output_type)) -class MultiLabelBinarizer(TransformerMixin, BaseEstimator): +class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): """Transform between iterable of iterables and a multilabel format. 
Although a list of sets or tuples is a very intuitive format for multilabel diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py index ea2f3b202bac4..81afeb6a8bd43 100644 --- a/sklearn/preprocessing/_target_encoder.py +++ b/sklearn/preprocessing/_target_encoder.py @@ -27,8 +27,8 @@ class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder): .. note:: `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a - cross fitting scheme is used in `fit_transform` for encoding. See the - :ref:`User Guide ` for details. + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide ` for details. .. versionadded:: 1.3 @@ -68,7 +68,7 @@ class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder): If `"auto"`, then `smooth` is set to an empirical Bayes estimate. cv : int, default=5 - Determines the number of folds in the cross fitting strategy used in + Determines the number of folds in the :term:`cross fitting` strategy used in :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used and for continuous targets, `KFold` is used. @@ -204,8 +204,8 @@ def fit_transform(self, X, y): .. note:: `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a - cross fitting scheme is used in `fit_transform` for encoding. See the - :ref:`User Guide `. for details. + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. Parameters ---------- @@ -260,8 +260,8 @@ def transform(self, X): .. note:: `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a - cross fitting scheme is used in `fit_transform` for encoding. See the - :ref:`User Guide `. for details. + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. Parameters ---------- @@ -273,14 +273,14 @@ def transform(self, X): X_trans : ndarray of shape (n_samples, n_features) Transformed input. """ - X_ordinal, X_valid = self._transform( + X_ordinal, X_known_mask = self._transform( X, handle_unknown="ignore", force_all_finite="allow-nan" ) X_out = np.empty_like(X_ordinal, dtype=np.float64) self._transform_X_ordinal( X_out, X_ordinal, - ~X_valid, + ~X_known_mask, slice(None), self.encodings_, self.target_mean_, @@ -299,8 +299,9 @@ def _fit_encodings_all(self, X, y): inferred_type_of_target = type_of_target(y, input_name="y") if inferred_type_of_target not in accepted_target_types: raise ValueError( - f"Target type was inferred to be {inferred_type_of_target!r}. Only" - f" {accepted_target_types} are supported." + "Unknown label type: Target type was inferred to be " + f"{inferred_type_of_target!r}. Only {accepted_target_types} are " + "supported." ) self.target_type_ = inferred_type_of_target else: @@ -343,4 +344,13 @@ def _transform_X_ordinal( X_out[X_unknown_mask[:, f_idx], f_idx] = y_mean def _more_tags(self): - return {"requires_y": True} + return { + "requires_y": True, + # TargetEncoder is a special case where a transformer uses `y` but + # only accept binary classification and regression targets. For the + # purpose of common tests we use `binary_only` tag to eliminate the + # multiclass tests. TODO: remove this special case when multiclass + # support is added to TargetEncoder. 
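# A hedged illustration of the cross-fitting note repeated in the TargetEncoder
# docstrings above (synthetic data; only the fact that the two paths differ matters):
import numpy as np

from sklearn.preprocessing import TargetEncoder

rng = np.random.RandomState(0)
X = rng.choice(["a", "b", "c"], size=(60, 1)).astype(object)
y = rng.randint(0, 2, size=60)

enc = TargetEncoder(random_state=0)
X_cross = enc.fit_transform(X, y)     # per-fold (cross-fitted) encodings
X_plain = enc.fit(X, y).transform(X)  # encodings learned on the full data

# The two generally differ because fit_transform uses cross fitting internally.
print(np.allclose(X_cross, X_plain))  # typically False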
xref: + # https://github.com/scikit-learn/scikit-learn/pull/26674 + "binary_only": True, + } diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index ca809dd513cf3..9ba041c90f5de 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1588,6 +1588,26 @@ def test_ohe_drop_first_explicit_categories(handle_unknown): assert_allclose(X_trans, X_expected) +def test_ohe_more_informative_error_message(): + """Raise informative error message when pandas output and sparse_output=True.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": [1, 2, 3], "b": ["z", "b", "b"]}, columns=["a", "b"]) + + ohe = OneHotEncoder(sparse_output=True) + ohe.set_output(transform="pandas") + + msg = ( + "Pandas output does not support sparse data. Set " + "sparse_output=False to output pandas DataFrames or disable pandas output" + ) + with pytest.raises(ValueError, match=msg): + ohe.fit_transform(df) + + ohe.fit(df) + with pytest.raises(ValueError, match=msg): + ohe.transform(df) + + def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype(): """Test ordinal encoder with nan passthrough fails when dtype=np.int32.""" diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index fa19171503a1d..7c4bb01535dca 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -6,7 +6,6 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer -from sklearn.utils import _safe_indexing from sklearn.utils._testing import ( _convert_container, assert_allclose_dense_sparse, @@ -196,9 +195,7 @@ def test_function_transformer_raise_error_with_mixed_dtype(X_type): data = _convert_container(data, X_type, columns_name=["value"], dtype=dtype) def func(X): - return np.array( - [mapping[_safe_indexing(X, i)] for i in range(X.size)], dtype=object - ) + return np.array([mapping[X[i]] for i in range(X.size)], dtype=object) def inverse_func(X): return _convert_container( diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 7d413063968e4..633a386c75951 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -672,3 +672,17 @@ def test_nan_label_encoder(): y_trans = le.transform([np.nan]) assert_array_equal(y_trans, [2]) + + +@pytest.mark.parametrize( + "encoder", [LabelEncoder(), LabelBinarizer(), MultiLabelBinarizer()] +) +def test_label_encoders_do_not_have_set_output(encoder): + """Check that label encoders do not define set_output and work with y as a kwarg. + + Non-regression test for #26854. 
+ """ + assert not hasattr(encoder, "set_output") + y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"]) + y_encoded_positional = encoder.fit_transform(["a", "b", "c"]) + assert_array_equal(y_encoded_with_kwarg, y_encoded_positional) diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 2fc5e04b5df83..eb126ec77e526 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -825,7 +825,7 @@ def predict(self, X): def _check_proba(self): if not self.probability: raise AttributeError( - "predict_proba is not available when probability=False" + "predict_proba is not available when probability=False" ) if self._impl not in ("c_svc", "nu_svc"): raise AttributeError("predict_proba only implemented for SVC and NuSVC") @@ -835,7 +835,7 @@ def _check_proba(self): def predict_proba(self, X): """Compute probabilities of possible outcomes for samples in X. - The model need to have probability information computed at training + The model needs to have probability information computed at training time: fit with attribute `probability` set to True. Parameters @@ -1095,18 +1095,26 @@ def _fit_liblinear( Target vector relative to X C : float - Inverse of cross-validation parameter. Lower the C, the more + Inverse of cross-validation parameter. The lower the C, the higher the penalization. fit_intercept : bool - Whether or not to fit the intercept, that is to add a intercept - term to the decision function. + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: ``[x_1, ..., x_n, 1]``, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). intercept_scaling : float - LibLinear internally penalizes the intercept and this term is subject - to regularization just like the other terms of the feature vector. - In order to avoid this, one should increase the intercept_scaling. - such that the feature vector becomes [x, intercept_scaling]. + Liblinear internally penalizes the intercept, treating it like any + other term in the feature vector. To reduce the impact of the + regularization on the intercept, the `intercept_scaling` parameter can + be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 7a54c02201ccb..dfa48b4937147 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -49,6 +49,10 @@ class LinearSVC(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): penalties and loss functions and should scale better to large numbers of samples. + The main differences between :class:`~sklearn.svm.LinearSVC` and + :class:`~sklearn.svm.SVC` lie in the loss function used by default, and in + the handling of intercept regularization between those two implementations. + This class supports both dense and sparse input and the multiclass support is handled according to a one-vs-the-rest scheme. 
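# To make the LinearSVC/SVC difference noted above concrete, a small hedged
# comparison on synthetic data (the two fits are expected to be close, not identical):
from sklearn.datasets import make_classification
from sklearn.svm import SVC, LinearSVC

X, y = make_classification(n_samples=200, n_features=5, random_state=0)

# Squared hinge loss, one-vs-rest, liblinear with a penalized intercept ...
linear_svc = LinearSVC(C=1.0, random_state=0).fit(X, y)
# ... versus hinge loss solved by libsvm's SMO with a linear kernel.
svc = SVC(kernel="linear", C=1.0).fit(X, y)

print(linear_svc.coef_)
print(svc.coef_)  # similar direction, but not expected to match exactly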
@@ -99,20 +103,26 @@ class LinearSVC(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): will be ignored. fit_intercept : bool, default=True - Whether to calculate the intercept for this model. If set - to false, no intercept will be used in calculations - (i.e. data is expected to be already centered). + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). intercept_scaling : float, default=1.0 - When self.fit_intercept is True, instance vector x becomes - ``[x, self.intercept_scaling]``, - i.e. a "synthetic" feature with constant value equals to - intercept_scaling is appended to the instance vector. - The intercept becomes intercept_scaling * synthetic feature weight - Note! the synthetic feature weight is subject to l1/l2 regularization - as all other features. - To lessen the effect of regularization on synthetic feature weight - (and therefore on the intercept) intercept_scaling has to be increased. + When `fit_intercept` is True, the instance vector x becomes ``[x_1, + ..., x_n, intercept_scaling]``, i.e. a "synthetic" feature with a + constant value equal to `intercept_scaling` is appended to the instance + vector. The intercept becomes intercept_scaling * synthetic feature + weight. Note that liblinear internally penalizes the intercept, + treating it like any other term in the feature vector. To reduce the + impact of the regularization on the intercept, the `intercept_scaling` + parameter can be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. class_weight : dict or 'balanced', default=None Set the parameter C of class i to ``class_weight[i]*C`` for @@ -362,6 +372,10 @@ class LinearSVR(RegressorMixin, LinearModel): penalties and loss functions and should scale better to large numbers of samples. + The main differences between :class:`~sklearn.svm.LinearSVR` and + :class:`~sklearn.svm.SVR` lie in the loss function used by default, and in + the handling of intercept regularization between those two implementations. + This class supports both dense and sparse input. Read more in the :ref:`User Guide `. @@ -389,20 +403,26 @@ class LinearSVR(RegressorMixin, LinearModel): loss ('squared_epsilon_insensitive') is the L2 loss. fit_intercept : bool, default=True - Whether to calculate the intercept for this model. If set - to false, no intercept will be used in calculations - (i.e. data is expected to be already centered). + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). intercept_scaling : float, default=1.0 - When self.fit_intercept is True, instance vector x becomes - [x, self.intercept_scaling], - i.e. a "synthetic" feature with constant value equals to - intercept_scaling is appended to the instance vector. 
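# A rough numerical sketch of the intercept handling described in these docstrings
# (toy data; the absolute values are not meaningful, only the tendency that a larger
# intercept_scaling reduces how strongly the intercept is regularized):
import numpy as np

from sklearn.svm import LinearSVC

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])

for scaling in (1.0, 100.0):
    clf = LinearSVC(C=0.1, intercept_scaling=scaling, random_state=0).fit(X, y)
    # liblinear fits weights for the extended vector [x_1, ..., x_n, intercept_scaling]
    # and reports intercept_ = intercept_scaling * w_synthetic.
    print(scaling, clf.coef_.ravel(), clf.intercept_)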
- The intercept becomes intercept_scaling * synthetic feature weight - Note! the synthetic feature weight is subject to l1/l2 regularization - as all other features. - To lessen the effect of regularization on synthetic feature weight - (and therefore on the intercept) intercept_scaling has to be increased. + When `fit_intercept` is True, the instance vector x becomes `[x_1, ..., + x_n, intercept_scaling]`, i.e. a "synthetic" feature with a constant + value equal to `intercept_scaling` is appended to the instance vector. + The intercept becomes intercept_scaling * synthetic feature weight. + Note that liblinear internally penalizes the intercept, treating it + like any other term in the feature vector. To reduce the impact of the + regularization on the intercept, the `intercept_scaling` parameter can + be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. dual : "auto" or bool, default=True Select the algorithm to either solve the dual or primal @@ -462,8 +482,8 @@ class LinearSVR(RegressorMixin, LinearModel): same library as this class (liblinear). SVR : Implementation of Support Vector Machine regression using libsvm: - the kernel can be non-linear but its SMO algorithm does not - scale to large number of samples as LinearSVC does. + the kernel can be non-linear but its SMO algorithm does not scale to + large number of samples as :class:`~sklearn.svm.LinearSVR` does. sklearn.linear_model.SGDRegressor : SGDRegressor can optimize the same cost function as LinearSVR @@ -774,7 +794,7 @@ class SVC(BaseSVC): Indices of support vectors. support_vectors_ : ndarray of shape (n_SV, n_features) - Support vectors. + Support vectors. An empty array if kernel is precomputed. n_support_ : ndarray of shape (n_classes,), dtype=int32 Number of support vectors for each class. diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py new file mode 100644 index 0000000000000..59166e6687369 --- /dev/null +++ b/sklearn/tests/metadata_routing_common.py @@ -0,0 +1,407 @@ +from functools import partial + +import numpy as np + +from sklearn.base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + RegressorMixin, + TransformerMixin, + clone, +) +from sklearn.metrics._scorer import _PredictScorer, mean_squared_error +from sklearn.model_selection import BaseCrossValidator +from sklearn.model_selection._split import GroupsConsumerMixin +from sklearn.utils._metadata_requests import ( + SIMPLE_METHODS, +) +from sklearn.utils.metadata_routing import ( + MetadataRouter, + process_routing, +) + + +def record_metadata(obj, method, record_default=True, **kwargs): + """Utility function to store passed metadata to a method. + + If record_default is False, kwargs whose values are "default" are skipped. + This is so that checks on keyword arguments whose default was not changed + are skipped. 
+ + """ + if not hasattr(obj, "_records"): + obj._records = {} + if not record_default: + kwargs = { + key: val + for key, val in kwargs.items() + if not isinstance(val, str) or (val != "default") + } + obj._records[method] = kwargs + + +def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): + """Check whether the expected metadata is passed to the object's method. + + Parameters + ---------- + split_params : tuple, default=empty + specifies any parameters which are to be checked as being a subset + of the original values. + + """ + records = getattr(obj, "_records", dict()).get(method, dict()) + assert set(kwargs.keys()) == set(records.keys()) + for key, value in kwargs.items(): + recorded_value = records[key] + # The following condition is used to check for any specified parameters + # being a subset of the original values + if key in split_params and recorded_value is not None: + assert np.isin(recorded_value, value).all() + else: + assert recorded_value is value + + +record_metadata_not_default = partial(record_metadata, record_default=False) + + +def assert_request_is_empty(metadata_request, exclude=None): + """Check if a metadata request dict is empty. + + One can exclude a method or a list of methods from the check using the + ``exclude`` parameter. + """ + if isinstance(metadata_request, MetadataRouter): + for _, route_mapping in metadata_request: + assert_request_is_empty(route_mapping.router) + return + + exclude = [] if exclude is None else exclude + for method in SIMPLE_METHODS: + if method in exclude: + continue + mmr = getattr(metadata_request, method) + props = [ + prop + for prop, alias in mmr.requests.items() + if isinstance(alias, str) or alias is not None + ] + assert not len(props) + + +def assert_request_equal(request, dictionary): + for method, requests in dictionary.items(): + mmr = getattr(request, method) + assert mmr.requests == requests + + empty_methods = [method for method in SIMPLE_METHODS if method not in dictionary] + for method in empty_methods: + assert not len(getattr(request, method).requests) + + +class _Registry(list): + # This list is used to get a reference to the sub-estimators, which are not + # necessarily stored on the metaestimator. We need to override __deepcopy__ + # because the sub-estimators are probably cloned, which would result in a + # new copy of the list, but we need copy and deep copy both to return the + # same instance. + def __deepcopy__(self, memo): + return self + + def __copy__(self): + return self + + +class ConsumingRegressor(RegressorMixin, BaseEstimator): + """A regressor consuming metadata. + + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. 
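# The _Registry defined above intentionally survives copying and cloning so that
# tests keep a handle on sub-estimators; a quick sketch of that behaviour
# (assuming the module path introduced by this patch):
import copy

from sklearn.tests.metadata_routing_common import _Registry

registry = _Registry()
# clone() deep-copies estimator parameters, but the registry must keep pointing
# at the same list object so recorded sub-estimators remain reachable.
assert copy.copy(registry) is registry
assert copy.deepcopy(registry) is registry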
+ + """ + + def __init__(self, registry=None): + self.registry = registry + + def partial_fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "partial_fit", sample_weight=sample_weight, metadata=metadata + ) + return self + + def fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "fit", sample_weight=sample_weight, metadata=metadata + ) + return self + + def predict(self, X, sample_weight="default", metadata="default"): + pass # pragma: no cover + + # when needed, uncomment the implementation + # if self.registry is not None: + # self.registry.append(self) + + # record_metadata_not_default( + # self, "predict", sample_weight=sample_weight, metadata=metadata + # ) + # return np.zeros(shape=(len(X),)) + + +class NonConsumingClassifier(ClassifierMixin, BaseEstimator): + """A classifier which accepts no metadata on any method.""" + + def __init__(self, registry=None): + self.registry = registry + + def fit(self, X, y): + if self.registry is not None: + self.registry.append(self) + + self.classes_ = [0, 1] + return self + + def predict(self, X): + return np.ones(len(X)) # pragma: no cover + + +class ConsumingClassifier(ClassifierMixin, BaseEstimator): + """A classifier consuming metadata. + + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + + """ + + def __init__(self, registry=None): + self.registry = registry + + def partial_fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "partial_fit", sample_weight=sample_weight, metadata=metadata + ) + self.classes_ = [0, 1] + return self + + def fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "fit", sample_weight=sample_weight, metadata=metadata + ) + self.classes_ = [0, 1] + return self + + def predict(self, X, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "predict", sample_weight=sample_weight, metadata=metadata + ) + return np.zeros(shape=(len(X),)) + + def predict_proba(self, X, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "predict_proba", sample_weight=sample_weight, metadata=metadata + ) + return np.asarray([[0.0, 1.0]] * len(X)) + + def predict_log_proba(self, X, sample_weight="default", metadata="default"): + pass # pragma: no cover + + # when needed, uncomment the implementation + # if self.registry is not None: + # self.registry.append(self) + + # record_metadata_not_default( + # self, "predict_log_proba", sample_weight=sample_weight, metadata=metadata + # ) + # return np.zeros(shape=(len(X), 2)) + + +class ConsumingTransformer(TransformerMixin, BaseEstimator): + """A transformer which accepts metadata on fit and transform. 
+ + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + """ + + def __init__(self, registry=None): + self.registry = registry + + def fit(self, X, y=None, sample_weight=None, metadata=None): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "fit", sample_weight=sample_weight, metadata=metadata + ) + return self + + def transform(self, X, sample_weight=None): + record_metadata(self, "transform", sample_weight=sample_weight) + return X + + +class ConsumingScorer(_PredictScorer): + def __init__(self, registry=None): + super().__init__(score_func=mean_squared_error, sign=1, kwargs={}) + self.registry = registry + + def _score(self, method_caller, clf, X, y, **kwargs): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default(self, "score", **kwargs) + + sample_weight = kwargs.get("sample_weight", None) + return super()._score(method_caller, clf, X, y, sample_weight=sample_weight) + + +class ConsumingSplitter(BaseCrossValidator, GroupsConsumerMixin): + def __init__(self, registry=None): + self.registry = registry + + def split(self, X, y=None, groups="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default(self, "split", groups=groups, metadata=metadata) + + split_index = len(X) // 2 + train_indices = list(range(0, split_index)) + test_indices = list(range(split_index, len(X))) + yield test_indices, train_indices + yield train_indices, test_indices + + def get_n_splits(self, X=None, y=None, groups=None): + pass # pragma: no cover + + def _iter_test_indices(self, X=None, y=None, groups=None): + split_index = len(X) // 2 + train_indices = list(range(0, split_index)) + test_indices = list(range(split_index, len(X))) + yield test_indices + yield train_indices + + +class MetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): + """A meta-regressor which is only a router.""" + + def __init__(self, estimator): + self.estimator = estimator + + def fit(self, X, y, **fit_params): + params = process_routing(self, "fit", **fit_params) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + + def get_metadata_routing(self): + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, method_mapping="one-to-one" + ) + return router + + +class WeightedMetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): + """A meta-regressor which is also a consumer.""" + + def __init__(self, estimator, registry=None): + self.estimator = estimator + self.registry = registry + + def fit(self, X, y, sample_weight=None, **fit_params): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, "fit", sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **fit_params) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + return self + + def predict(self, X, **predict_params): + params = process_routing(self, "predict", **predict_params) + return self.estimator_.predict(X, **params.estimator.predict) + + def get_metadata_routing(self): + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add(estimator=self.estimator, method_mapping="one-to-one") 
+ ) + return router + + +class WeightedMetaClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): + """A meta-estimator which also consumes sample_weight itself in ``fit``.""" + + def __init__(self, estimator, registry=None): + self.estimator = estimator + self.registry = registry + + def fit(self, X, y, sample_weight=None, **kwargs): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, "fit", sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **kwargs) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + return self + + def get_metadata_routing(self): + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add(estimator=self.estimator, method_mapping="fit") + ) + return router + + +class MetaTransformer(MetaEstimatorMixin, TransformerMixin, BaseEstimator): + """A simple meta-transformer.""" + + def __init__(self, transformer): + self.transformer = transformer + + def fit(self, X, y=None, **fit_params): + params = process_routing(self, "fit", **fit_params) + self.transformer_ = clone(self.transformer).fit(X, y, **params.transformer.fit) + return self + + def transform(self, X, y=None, **transform_params): + params = process_routing(self, "transform", **transform_params) + return self.transformer_.transform(X, **params.transformer.transform) + + def get_metadata_routing(self): + return MetadataRouter(owner=self.__class__.__name__).add( + transformer=self.transformer, method_mapping="one-to-one" + ) diff --git a/sklearn/tests/test_metadata_routing.py b/sklearn/tests/test_metadata_routing.py index 3fc6a9c337f47..50b6f912667ba 100644 --- a/sklearn/tests/test_metadata_routing.py +++ b/sklearn/tests/test_metadata_routing.py @@ -13,13 +13,23 @@ from sklearn import config_context from sklearn.base import ( BaseEstimator, - ClassifierMixin, - MetaEstimatorMixin, - RegressorMixin, - TransformerMixin, clone, ) from sklearn.linear_model import LinearRegression +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingRegressor, + ConsumingTransformer, + MetaRegressor, + MetaTransformer, + NonConsumingClassifier, + WeightedMetaClassifier, + WeightedMetaRegressor, + _Registry, + assert_request_equal, + assert_request_is_empty, + check_recorded_metadata, +) from sklearn.utils import metadata_routing from sklearn.utils._metadata_requests import ( COMPOSITE_METHODS, @@ -56,209 +66,6 @@ def enable_slep006(): yield -def assert_request_is_empty(metadata_request, exclude=None): - """Check if a metadata request dict is empty. - - One can exclude a method or a list of methods from the check using the - ``exclude`` parameter. 
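# A hedged end-to-end sketch of how the helpers above are meant to be combined,
# mirroring the routing tests in this patch (requires enable_metadata_routing=True):
import numpy as np

from sklearn import config_context
from sklearn.tests.metadata_routing_common import (
    ConsumingRegressor,
    WeightedMetaRegressor,
    check_recorded_metadata,
)

X = np.arange(8, dtype=float).reshape(4, 2)
y = np.array([0.0, 1.0, 2.0, 3.0])
w = np.array([1.0, 2.0, 1.0, 2.0])

with config_context(enable_metadata_routing=True):
    meta = WeightedMetaRegressor(
        estimator=ConsumingRegressor().set_fit_request(sample_weight=True)
    )
    # The meta-estimator consumes sample_weight itself and, because the inner
    # estimator requested it, also routes it down to ConsumingRegressor.fit.
    meta.fit(X, y, sample_weight=w)
    check_recorded_metadata(meta.estimator_, "fit", sample_weight=w)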
- """ - if isinstance(metadata_request, MetadataRouter): - for _, route_mapping in metadata_request: - assert_request_is_empty(route_mapping.router) - return - - exclude = [] if exclude is None else exclude - for method in SIMPLE_METHODS: - if method in exclude: - continue - mmr = getattr(metadata_request, method) - props = [ - prop - for prop, alias in mmr.requests.items() - if isinstance(alias, str) or alias is not None - ] - assert not len(props) - - -def assert_request_equal(request, dictionary): - for method, requests in dictionary.items(): - mmr = getattr(request, method) - assert mmr.requests == requests - - empty_methods = [method for method in SIMPLE_METHODS if method not in dictionary] - for method in empty_methods: - assert not len(getattr(request, method).requests) - - -def record_metadata(obj, method, record_default=True, **kwargs): - """Utility function to store passed metadata to a method. - - If record_default is False, kwargs whose values are "default" are skipped. - This is so that checks on keyword arguments whose default was not changed - are skipped. - - """ - if not hasattr(obj, "_records"): - obj._records = {} - if not record_default: - kwargs = { - key: val - for key, val in kwargs.items() - if not isinstance(val, str) or (val != "default") - } - obj._records[method] = kwargs - - -def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): - """Check whether the expected metadata is passed to the object's method. - - Parameters - ---------- - split_params : tuple, default=empty - specifies any parameters which are to be checked as being a subset - of the original values. - - """ - records = getattr(obj, "_records", dict()).get(method, dict()) - assert set(kwargs.keys()) == set(records.keys()) - for key, value in kwargs.items(): - recorded_value = records[key] - # The following condition is used to check for any specified parameters - # being a subset of the original values - if key in split_params and recorded_value is not None: - assert np.isin(recorded_value, value).all() - else: - assert recorded_value is value - - -class MetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): - """A meta-regressor which is only a router.""" - - def __init__(self, estimator): - self.estimator = estimator - - def fit(self, X, y, **fit_params): - params = process_routing(self, "fit", fit_params) - self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) - - def get_metadata_routing(self): - router = MetadataRouter(owner=self.__class__.__name__).add( - estimator=self.estimator, method_mapping="one-to-one" - ) - return router - - -class RegressorMetadata(RegressorMixin, BaseEstimator): - """A regressor consuming a metadata.""" - - def fit(self, X, y, sample_weight=None): - record_metadata(self, "fit", sample_weight=sample_weight) - return self - - def predict(self, X): - return np.zeros(shape=(len(X))) - - -class WeightedMetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): - """A meta-regressor which is also a consumer.""" - - def __init__(self, estimator): - self.estimator = estimator - - def fit(self, X, y, sample_weight=None, **fit_params): - record_metadata(self, "fit", sample_weight=sample_weight) - params = process_routing(self, "fit", fit_params, sample_weight=sample_weight) - self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) - return self - - def predict(self, X, **predict_params): - params = process_routing(self, "predict", predict_params) - return self.estimator_.predict(X, 
**params.estimator.predict) - - def get_metadata_routing(self): - router = ( - MetadataRouter(owner=self.__class__.__name__) - .add_self_request(self) - .add(estimator=self.estimator, method_mapping="one-to-one") - ) - return router - - -class ClassifierNoMetadata(ClassifierMixin, BaseEstimator): - """An estimator which accepts no metadata on any method.""" - - def fit(self, X, y): - return self - - def predict(self, X): - return np.ones(len(X)) # pragma: no cover - - -class ClassifierFitMetadata(ClassifierMixin, BaseEstimator): - """An estimator accepting two metadata in its ``fit`` method.""" - - def fit(self, X, y, sample_weight=None, brand=None): - record_metadata(self, "fit", sample_weight=sample_weight, brand=brand) - return self - - def predict(self, X): - return np.ones(len(X)) # pragma: no cover - - -class SimpleMetaClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): - """A meta-estimator which also consumes sample_weight itself in ``fit``.""" - - def __init__(self, estimator): - self.estimator = estimator - - def fit(self, X, y, sample_weight=None, **kwargs): - record_metadata(self, "fit", sample_weight=sample_weight) - params = process_routing(self, "fit", kwargs, sample_weight=sample_weight) - self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) - return self - - def get_metadata_routing(self): - router = ( - MetadataRouter(owner=self.__class__.__name__) - .add_self_request(self) - .add(estimator=self.estimator, method_mapping="fit") - ) - return router - - -class TransformerMetadata(TransformerMixin, BaseEstimator): - """A transformer which accepts metadata on fit and transform.""" - - def fit(self, X, y=None, brand=None, sample_weight=None): - record_metadata(self, "fit", brand=brand, sample_weight=sample_weight) - return self - - def transform(self, X, sample_weight=None): - record_metadata(self, "transform", sample_weight=sample_weight) - return X - - -class MetaTransformer(MetaEstimatorMixin, TransformerMixin, BaseEstimator): - """A simple meta-transformer.""" - - def __init__(self, transformer): - self.transformer = transformer - - def fit(self, X, y=None, **fit_params): - params = process_routing(self, "fit", fit_params) - self.transformer_ = clone(self.transformer).fit(X, y, **params.transformer.fit) - return self - - def transform(self, X, y=None, **transform_params): - params = process_routing(self, "transform", transform_params) - return self.transformer_.transform(X, **params.transformer.transform) - - def get_metadata_routing(self): - return MetadataRouter(owner=self.__class__.__name__).add( - transformer=self.transformer, method_mapping="one-to-one" - ) - - class SimplePipeline(BaseEstimator): """A very simple pipeline, assuming the last step is always a predictor.""" @@ -267,7 +74,7 @@ def __init__(self, steps): def fit(self, X, y, **fit_params): self.steps_ = [] - params = process_routing(self, "fit", fit_params) + params = process_routing(self, "fit", **fit_params) X_transformed = X for i, step in enumerate(self.steps[:-1]): transformer = clone(step).fit( @@ -286,7 +93,7 @@ def fit(self, X, y, **fit_params): def predict(self, X, **predict_params): check_is_fitted(self) X_transformed = X - params = process_routing(self, "predict", predict_params) + params = process_routing(self, "predict", **predict_params) for i, step in enumerate(self.steps_[:-1]): X_transformed = step.transform(X, **params.get(f"step_{i}").transform) @@ -334,10 +141,27 @@ def test_assert_request_is_empty(): assert_request_is_empty( MetadataRouter(owner="test") 
.add_self_request(WeightedMetaRegressor(estimator=None)) - .add(method_mapping="fit", estimator=RegressorMetadata()) + .add(method_mapping="fit", estimator=ConsumingRegressor()) ) +@pytest.mark.parametrize( + "estimator", + [ + ConsumingClassifier(registry=_Registry()), + ConsumingRegressor(registry=_Registry()), + ConsumingTransformer(registry=_Registry()), + NonConsumingClassifier(registry=_Registry()), + WeightedMetaClassifier(estimator=ConsumingClassifier(), registry=_Registry()), + WeightedMetaRegressor(estimator=ConsumingRegressor(), registry=_Registry()), + ], +) +def test_estimator_puts_self_in_registry(estimator): + """Check that an estimator puts itself in the registry upon fit.""" + estimator.fit(X, y) + assert estimator in estimator.registry + + @pytest.mark.parametrize( "val, res", [ @@ -383,90 +207,90 @@ class OddEstimator(BaseEstimator): assert odd_request.fit.requests == {"sample_weight": True} # check other test estimators - assert not len(get_routing_for_object(ClassifierNoMetadata()).fit.requests) - assert_request_is_empty(ClassifierNoMetadata().get_metadata_routing()) + assert not len(get_routing_for_object(NonConsumingClassifier()).fit.requests) + assert_request_is_empty(NonConsumingClassifier().get_metadata_routing()) - trs_request = get_routing_for_object(TransformerMetadata()) + trs_request = get_routing_for_object(ConsumingTransformer()) assert trs_request.fit.requests == { "sample_weight": None, - "brand": None, + "metadata": None, } assert trs_request.transform.requests == { "sample_weight": None, } assert_request_is_empty(trs_request) - est_request = get_routing_for_object(ClassifierFitMetadata()) + est_request = get_routing_for_object(ConsumingClassifier()) assert est_request.fit.requests == { "sample_weight": None, - "brand": None, + "metadata": None, } assert_request_is_empty(est_request) def test_process_routing_invalid_method(): with pytest.raises(TypeError, match="Can only route and process input"): - process_routing(ClassifierFitMetadata(), "invalid_method", {}) + process_routing(ConsumingClassifier(), "invalid_method", **{}) def test_process_routing_invalid_object(): class InvalidObject: pass - with pytest.raises(AttributeError, match="has not implemented the routing"): - process_routing(InvalidObject(), "fit", {}) + with pytest.raises(AttributeError, match="either implement the routing method"): + process_routing(InvalidObject(), "fit", **{}) def test_simple_metadata_routing(): # Tests that metadata is properly routed # The underlying estimator doesn't accept or request metadata - clf = SimpleMetaClassifier(estimator=ClassifierNoMetadata()) + clf = WeightedMetaClassifier(estimator=NonConsumingClassifier()) clf.fit(X, y) # Meta-estimator consumes sample_weight, but doesn't forward it to the underlying # estimator - clf = SimpleMetaClassifier(estimator=ClassifierNoMetadata()) + clf = WeightedMetaClassifier(estimator=NonConsumingClassifier()) clf.fit(X, y, sample_weight=my_weights) # If the estimator accepts the metadata but doesn't explicitly say it doesn't # need it, there's an error - clf = SimpleMetaClassifier(estimator=ClassifierFitMetadata()) + clf = WeightedMetaClassifier(estimator=ConsumingClassifier()) err_message = ( "[sample_weight] are passed but are not explicitly set as requested or" - " not for ClassifierFitMetadata.fit" + " not for ConsumingClassifier.fit" ) with pytest.raises(ValueError, match=re.escape(err_message)): clf.fit(X, y, sample_weight=my_weights) # Explicitly saying the estimator doesn't need it, makes the error go away, - # 
because in this case `SimpleMetaClassifier` consumes `sample_weight`. If + # because in this case `WeightedMetaClassifier` consumes `sample_weight`. If # there was no consumer of sample_weight, passing it would result in an # error. - clf = SimpleMetaClassifier( - estimator=ClassifierFitMetadata().set_fit_request(sample_weight=False) + clf = WeightedMetaClassifier( + estimator=ConsumingClassifier().set_fit_request(sample_weight=False) ) - # this doesn't raise since SimpleMetaClassifier itself is a consumer, + # this doesn't raise since WeightedMetaClassifier itself is a consumer, # and passing metadata to the consumer directly is fine regardless of its # metadata_request values. clf.fit(X, y, sample_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit", sample_weight=None, brand=None) + check_recorded_metadata(clf.estimator_, "fit") # Requesting a metadata will make the meta-estimator forward it correctly - clf = SimpleMetaClassifier( - estimator=ClassifierFitMetadata().set_fit_request(sample_weight=True) + clf = WeightedMetaClassifier( + estimator=ConsumingClassifier().set_fit_request(sample_weight=True) ) clf.fit(X, y, sample_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights, brand=None) + check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights) # And requesting it with an alias - clf = SimpleMetaClassifier( - estimator=ClassifierFitMetadata().set_fit_request( + clf = WeightedMetaClassifier( + estimator=ConsumingClassifier().set_fit_request( sample_weight="alternative_weight" ) ) clf.fit(X, y, alternative_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights, brand=None) + check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights) def test_nested_routing(): @@ -474,23 +298,23 @@ def test_nested_routing(): pipeline = SimplePipeline( [ MetaTransformer( - transformer=TransformerMetadata() - .set_fit_request(brand=True, sample_weight=False) + transformer=ConsumingTransformer() + .set_fit_request(metadata=True, sample_weight=False) .set_transform_request(sample_weight=True) ), WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request( - sample_weight="inner_weights" - ) + estimator=ConsumingRegressor() + .set_fit_request(sample_weight="inner_weights", metadata=False) + .set_predict_request(sample_weight=False) ).set_fit_request(sample_weight="outer_weights"), ] ) w1, w2, w3 = [1], [2], [3] pipeline.fit( - X, y, brand=my_groups, sample_weight=w1, outer_weights=w2, inner_weights=w3 + X, y, metadata=my_groups, sample_weight=w1, outer_weights=w2, inner_weights=w3 ) check_recorded_metadata( - pipeline.steps_[0].transformer_, "fit", brand=my_groups, sample_weight=None + pipeline.steps_[0].transformer_, "fit", metadata=my_groups, sample_weight=None ) check_recorded_metadata( pipeline.steps_[0].transformer_, "transform", sample_weight=w1 @@ -509,12 +333,12 @@ def test_nested_routing_conflict(): pipeline = SimplePipeline( [ MetaTransformer( - transformer=TransformerMetadata() - .set_fit_request(brand=True, sample_weight=False) + transformer=ConsumingTransformer() + .set_fit_request(metadata=True, sample_weight=False) .set_transform_request(sample_weight=True) ), WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request(sample_weight=True) + estimator=ConsumingRegressor().set_fit_request(sample_weight=True) ).set_fit_request(sample_weight="outer_weights"), ] ) @@ -530,13 +354,13 @@ def test_nested_routing_conflict(): ) ), ): - pipeline.fit(X, y, 
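# The same request-or-raise behaviour outside of pytest, as a hedged sketch:
# passing metadata that a sub-estimator could consume but has not explicitly
# requested is an error, and declaring the request resolves it.
import numpy as np

from sklearn import config_context
from sklearn.tests.metadata_routing_common import (
    ConsumingClassifier,
    WeightedMetaClassifier,
)

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 1, 0, 1])
w = np.array([1.0, 2.0, 1.0, 2.0])

with config_context(enable_metadata_routing=True):
    clf = WeightedMetaClassifier(estimator=ConsumingClassifier())
    try:
        clf.fit(X, y, sample_weight=w)  # request on the inner estimator is unset
    except ValueError as exc:
        print(exc)

    clf = WeightedMetaClassifier(
        estimator=ConsumingClassifier().set_fit_request(sample_weight=True)
    )
    clf.fit(X, y, sample_weight=w)      # now routed to ConsumingClassifier.fit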
brand=my_groups, sample_weight=w1, outer_weights=w2) + pipeline.fit(X, y, metadata=my_groups, sample_weight=w1, outer_weights=w2) def test_invalid_metadata(): # check that passing wrong metadata raises an error trs = MetaTransformer( - transformer=TransformerMetadata().set_transform_request(sample_weight=True) + transformer=ConsumingTransformer().set_transform_request(sample_weight=True) ) with pytest.raises( TypeError, @@ -546,7 +370,7 @@ def test_invalid_metadata(): # passing a metadata which is not requested by any estimator should also raise trs = MetaTransformer( - transformer=TransformerMetadata().set_transform_request(sample_weight=False) + transformer=ConsumingTransformer().set_transform_request(sample_weight=False) ) with pytest.raises( TypeError, @@ -751,14 +575,14 @@ def test_metadata_router_consumes_method(): cases = [ ( WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request(sample_weight=True) + estimator=ConsumingRegressor().set_fit_request(sample_weight=True) ), {"sample_weight"}, {"sample_weight"}, ), ( WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request( + estimator=ConsumingRegressor().set_fit_request( sample_weight="my_weights" ) ), @@ -784,13 +608,13 @@ class WeightedMetaRegressorWarn(WeightedMetaRegressor): def test_estimator_warnings(): - class RegressorMetadataWarn(RegressorMetadata): + class ConsumingRegressorWarn(ConsumingRegressor): __metadata_request__fit = {"sample_weight": metadata_routing.WARN} with pytest.warns( UserWarning, match="Support for .* has recently been added to this class" ): - MetaRegressor(estimator=RegressorMetadataWarn()).fit( + MetaRegressor(estimator=ConsumingRegressorWarn()).fit( X, y, sample_weight=my_weights ) @@ -811,12 +635,14 @@ class RegressorMetadataWarn(RegressorMetadata): (MethodMapping.from_str("score"), "[{'callee': 'score', 'caller': 'score'}]"), ( MetadataRouter(owner="test").add( - method_mapping="predict", estimator=RegressorMetadata() + method_mapping="predict", estimator=ConsumingRegressor() ), ( - "{'estimator': {'mapping': [{'callee': 'predict', 'caller': " - "'predict'}], 'router': {'fit': {'sample_weight': None}, " - "'score': {'sample_weight': None}}}}" + "{'estimator': {'mapping': [{'callee': 'predict', 'caller':" + " 'predict'}], 'router': {'fit': {'sample_weight': None, 'metadata':" + " None}, 'partial_fit': {'sample_weight': None, 'metadata': None}," + " 'predict': {'sample_weight': None, 'metadata': None}, 'score':" + " {'sample_weight': None}}}}" ), ), ], @@ -857,7 +683,7 @@ def test_string_representations(obj, string): "Given `obj` is neither a `MetadataRequest` nor does it implement", ), ( - ClassifierFitMetadata(), + ConsumingClassifier(), "set_fit_request", {"invalid": True}, TypeError, @@ -900,14 +726,14 @@ def test_metadatarouter_add_self_request(): assert router._self_request is not request # one can add an estimator as self - est = RegressorMetadata().set_fit_request(sample_weight="my_weights") + est = ConsumingRegressor().set_fit_request(sample_weight="my_weights") router = MetadataRouter(owner="test").add_self_request(obj=est) assert str(router._self_request) == str(est.get_metadata_routing()) assert router._self_request is not est.get_metadata_routing() # adding a consumer+router as self should only add the consumer part est = WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request(sample_weight="nested_weights") + estimator=ConsumingRegressor().set_fit_request(sample_weight="nested_weights") ) router = 
MetadataRouter(owner="test").add_self_request(obj=est) # _get_metadata_request() returns the consumer part of the requests @@ -923,25 +749,27 @@ def test_metadata_routing_add(): # adding one with a string `method_mapping` router = MetadataRouter(owner="test").add( method_mapping="fit", - est=RegressorMetadata().set_fit_request(sample_weight="weights"), + est=ConsumingRegressor().set_fit_request(sample_weight="weights"), ) assert ( str(router) - == "{'est': {'mapping': [{'callee': 'fit', 'caller': 'fit'}], " - "'router': {'fit': {'sample_weight': 'weights'}, 'score': " - "{'sample_weight': None}}}}" + == "{'est': {'mapping': [{'callee': 'fit', 'caller': 'fit'}], 'router': {'fit':" + " {'sample_weight': 'weights', 'metadata': None}, 'partial_fit':" + " {'sample_weight': None, 'metadata': None}, 'predict': {'sample_weight':" + " None, 'metadata': None}, 'score': {'sample_weight': None}}}}" ) # adding one with an instance of MethodMapping router = MetadataRouter(owner="test").add( method_mapping=MethodMapping().add(callee="score", caller="fit"), - est=RegressorMetadata().set_score_request(sample_weight=True), + est=ConsumingRegressor().set_score_request(sample_weight=True), ) assert ( str(router) - == "{'est': {'mapping': [{'callee': 'score', 'caller': 'fit'}], " - "'router': {'fit': {'sample_weight': None}, 'score': " - "{'sample_weight': True}}}}" + == "{'est': {'mapping': [{'callee': 'score', 'caller': 'fit'}], 'router':" + " {'fit': {'sample_weight': None, 'metadata': None}, 'partial_fit':" + " {'sample_weight': None, 'metadata': None}, 'predict': {'sample_weight':" + " None, 'metadata': None}, 'score': {'sample_weight': True}}}}" ) @@ -949,13 +777,13 @@ def test_metadata_routing_get_param_names(): router = ( MetadataRouter(owner="test") .add_self_request( - WeightedMetaRegressor(estimator=RegressorMetadata()).set_fit_request( + WeightedMetaRegressor(estimator=ConsumingRegressor()).set_fit_request( sample_weight="self_weights" ) ) .add( method_mapping="fit", - trs=TransformerMetadata().set_fit_request( + trs=ConsumingTransformer().set_fit_request( sample_weight="transform_weights" ), ) @@ -963,24 +791,23 @@ def test_metadata_routing_get_param_names(): assert ( str(router) - == "{'$self_request': {'fit': {'sample_weight': 'self_weights'}, 'score': " - "{'sample_weight': None}}, 'trs': {'mapping': [{'callee': 'fit', " - "'caller': 'fit'}], 'router': {'fit': {'brand': None, " - "'sample_weight': 'transform_weights'}, 'transform': " - "{'sample_weight': None}}}}" + == "{'$self_request': {'fit': {'sample_weight': 'self_weights'}, 'score':" + " {'sample_weight': None}}, 'trs': {'mapping': [{'callee': 'fit', 'caller':" + " 'fit'}], 'router': {'fit': {'sample_weight': 'transform_weights'," + " 'metadata': None}, 'transform': {'sample_weight': None}}}}" ) assert router._get_param_names( method="fit", return_alias=True, ignore_self_request=False - ) == {"transform_weights", "brand", "self_weights"} + ) == {"transform_weights", "metadata", "self_weights"} # return_alias=False will return original names for "self" assert router._get_param_names( method="fit", return_alias=False, ignore_self_request=False - ) == {"sample_weight", "brand", "transform_weights"} + ) == {"sample_weight", "metadata", "transform_weights"} # ignoring self would remove "sample_weight" assert router._get_param_names( method="fit", return_alias=False, ignore_self_request=True - ) == {"brand", "transform_weights"} + ) == {"metadata", "transform_weights"} # return_alias is ignored when ignore_self_request=True assert 
router._get_param_names( method="fit", return_alias=True, ignore_self_request=True @@ -1138,9 +965,9 @@ def test_no_feature_flag_raises_error(): """Test that when feature flag disabled, set_{method}_requests raises.""" with config_context(enable_metadata_routing=False): with pytest.raises(RuntimeError, match="This method is only available"): - ClassifierFitMetadata().set_fit_request(sample_weight=True) + ConsumingClassifier().set_fit_request(sample_weight=True) def test_none_metadata_passed(): """Test that passing None as metadata when not requested doesn't raise""" - MetaRegressor(estimator=RegressorMetadata()).fit(X, y, sample_weight=None) + MetaRegressor(estimator=ConsumingRegressor()).fit(X, y, sample_weight=None) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 768a57c61dc52..4a548fe9f067f 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -1,28 +1,27 @@ import copy import re -from functools import partial import numpy as np import pytest from sklearn import config_context -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.calibration import CalibratedClassifierCV from sklearn.exceptions import UnsetMetadataPassedError from sklearn.linear_model import LogisticRegressionCV -from sklearn.metrics._scorer import _BaseScorer -from sklearn.model_selection import BaseCrossValidator -from sklearn.model_selection._split import GroupsConsumerMixin from sklearn.multioutput import ( ClassifierChain, MultiOutputClassifier, MultiOutputRegressor, RegressorChain, ) -from sklearn.tests.test_metadata_routing import ( +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingRegressor, + ConsumingScorer, + ConsumingSplitter, + _Registry, assert_request_is_empty, check_recorded_metadata, - record_metadata, ) from sklearn.utils.metadata_routing import MetadataRouter @@ -43,179 +42,6 @@ def enable_slep006(): yield -record_metadata_not_default = partial(record_metadata, record_default=False) - - -class _Registry(list): - # This list is used to get a reference to the sub-estimators, which are not - # necessarily stored on the metaestimator. We need to override __deepcopy__ - # because the sub-estimators are probably cloned, which would result in a - # new copy of the list, but we need copy and deep copy both to return the - # same instance. - def __deepcopy__(self, memo): - return self - - def __copy__(self): - return self - - -class ConsumingRegressor(RegressorMixin, BaseEstimator): - """A regressor consuming metadata. - - Parameters - ---------- - registry : list, default=None - If a list, the estimator will append itself to the list in order to have - a reference to the estimator later on. Since that reference is not - required in all tests, registration can be skipped by leaving this value - as None. 
- - """ - - def __init__(self, registry=None): - self.registry = registry - - def partial_fit(self, X, y, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "partial_fit", sample_weight=sample_weight, metadata=metadata - ) - return self - - def fit(self, X, y, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "fit", sample_weight=sample_weight, metadata=metadata - ) - return self - - def predict(self, X, sample_weight="default", metadata="default"): - pass # pragma: no cover - - # when needed, uncomment the implementation - # if self.registry is not None: - # self.registry.append(self) - - # record_metadata_not_default( - # self, "predict", sample_weight=sample_weight, metadata=metadata - # ) - # return np.zeros(shape=(len(X),)) - - -class ConsumingClassifier(ClassifierMixin, BaseEstimator): - """A classifier consuming metadata. - - Parameters - ---------- - registry : list, default=None - If a list, the estimator will append itself to the list in order to have - a reference to the estimator later on. Since that reference is not - required in all tests, registration can be skipped by leaving this value - as None. - - """ - - def __init__(self, registry=None): - self.registry = registry - - def partial_fit(self, X, y, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "partial_fit", sample_weight=sample_weight, metadata=metadata - ) - self.classes_ = [0, 1] - return self - - def fit(self, X, y, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "fit", sample_weight=sample_weight, metadata=metadata - ) - self.classes_ = [0, 1] - return self - - def predict(self, X, sample_weight="default", metadata="default"): - pass # pragma: no cover - - # when needed, uncomment the implementation - # if self.registry is not None: - # self.registry.append(self) - - # record_metadata_not_default( - # self, "predict", sample_weight=sample_weight, metadata=metadata - # ) - # return np.zeros(shape=(len(X),)) - - def predict_proba(self, X, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "predict_proba", sample_weight=sample_weight, metadata=metadata - ) - return np.asarray([[0.0, 1.0]] * len(X)) - - def predict_log_proba(self, X, sample_weight="default", metadata="default"): - pass # pragma: no cover - - # when needed, uncomment the implementation - # if self.registry is not None: - # self.registry.append(self) - - # record_metadata_not_default( - # self, "predict_log_proba", sample_weight=sample_weight, metadata=metadata - # ) - # return np.zeros(shape=(len(X), 2)) - - -class ConsumingScorer(_BaseScorer): - def __init__(self, registry=None): - super().__init__(score_func="test", sign=1, kwargs={}) - self.registry = registry - - def __call__( - self, estimator, X, y_true, sample_weight="default", metadata="default" - ): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "score", sample_weight=sample_weight, metadata=metadata - ) - - return 0.0 - - -class ConsumingSplitter(BaseCrossValidator, GroupsConsumerMixin): - def __init__(self, registry=None): - self.registry = registry - - 
def split(self, X, y=None, groups="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default(self, "split", groups=groups) - - split_index = len(X) - 10 - train_indices = range(0, split_index) - test_indices = range(split_index, len(X)) - yield test_indices, train_indices - - def get_n_splits(self, X=None, y=None, groups=None): - pass # pragma: no cover - - def _iter_test_indices(self, X=None, y=None, groups=None): - pass # pragma: no cover - - METAESTIMATORS: list = [ { "metaestimator": MultiOutputRegressor, @@ -279,7 +105,7 @@ def _iter_test_indices(self, X=None, y=None, groups=None): # ids used for pytest fixture METAESTIMATOR_IDS = [str(row["metaestimator"].__name__) for row in METAESTIMATORS] -CV_SCORERS = [ +CV_SCORERS: list = [ { "cv_estimator": LogisticRegressionCV, "scorer_name": "scoring", @@ -287,7 +113,7 @@ def _iter_test_indices(self, X=None, y=None, groups=None): }, ] -CV_SPLITTERS = [ +CV_SPLITTERS: list = [ { "cv_estimator": LogisticRegressionCV, "splitter_name": "cv", @@ -295,6 +121,10 @@ def _iter_test_indices(self, X=None, y=None, groups=None): } ] +# IDs used by pytest to get meaningful verbose messages when running the tests +CV_SCORER_IDS = [x["cv_estimator"].__name__ for x in CV_SCORERS] +CV_SPLITTER_IDS = [x["cv_estimator"].__name__ for x in CV_SPLITTERS] + def test_registry_copy(): # test that _Registry is not copied into a new instance. @@ -390,7 +220,7 @@ def set_request(estimator, method_name): check_recorded_metadata(estimator, method_name, **kwargs) -@pytest.mark.parametrize("cv_scorer", CV_SCORERS) +@pytest.mark.parametrize("cv_scorer", CV_SCORERS, ids=CV_SCORER_IDS) def test_metadata_is_routed_correctly_to_scorer(cv_scorer): """Test that any requested metadata is correctly routed to the underlying scorers in CV estimators. @@ -406,6 +236,8 @@ def test_metadata_is_routed_correctly_to_scorer(cv_scorer): instance = cls(**{scorer_name: scorer}) method = getattr(instance, method_name) kwargs = {"sample_weight": sample_weight} + if "fit" not in method_name: # instance needs to be fitted first + instance.fit(X, y) method(X, y, **kwargs) for _scorer in registry: check_recorded_metadata( @@ -416,7 +248,7 @@ def test_metadata_is_routed_correctly_to_scorer(cv_scorer): ) -@pytest.mark.parametrize("cv_splitter", CV_SPLITTERS) +@pytest.mark.parametrize("cv_splitter", CV_SPLITTERS, ids=CV_SPLITTER_IDS) def test_metadata_is_routed_correctly_to_splitter(cv_splitter): """Test that any requested metadata is correctly routed to the underlying splitters in CV estimators. 
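# A heavily hedged sketch of the splitter path exercised by the test above,
# assuming a build where LogisticRegressionCV supports metadata routing
# (as the tests in this file require):
import numpy as np

from sklearn import config_context
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tests.metadata_routing_common import (
    ConsumingSplitter,
    _Registry,
    check_recorded_metadata,
)

rng = np.random.RandomState(0)
X = rng.normal(size=(20, 3))
y = np.tile([0, 1], 10)
groups = np.tile([0, 1], 10)

with config_context(enable_metadata_routing=True):
    registry = _Registry()
    clf = LogisticRegressionCV(cv=ConsumingSplitter(registry=registry))
    # groups is requested by the splitter (via GroupsConsumerMixin), so it is
    # routed from fit down to ConsumingSplitter.split.
    clf.fit(X, y, groups=groups)
    for splitter in registry:
        check_recorded_metadata(
            splitter, "split", split_params=("groups",), groups=groups
        )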
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index c4e565e13aae1..793e5793aec3f 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -17,7 +17,11 @@ from sklearn.datasets import load_iris from sklearn.decomposition import PCA, TruncatedSVD from sklearn.dummy import DummyRegressor -from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + RandomForestClassifier, + RandomTreesEmbedding, +) from sklearn.exceptions import NotFittedError from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_selection import SelectKBest, f_classif @@ -27,7 +31,7 @@ from sklearn.model_selection import train_test_split from sklearn.neighbors import LocalOutlierFactor from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import FunctionTransformer, StandardScaler from sklearn.svm import SVC from sklearn.utils._metadata_requests import COMPOSITE_METHODS, METHODS from sklearn.utils._testing import ( @@ -1828,5 +1832,26 @@ def test_routing_passed_metadata_not_supported(method): getattr(pipe, method)([[1]], sample_weight=[1], prop="a") +@pytest.mark.usefixtures("enable_slep006") +def test_pipeline_with_estimator_with_len(): + """Test that pipeline works with estimators that have a `__len__` method.""" + pipe = Pipeline( + [("trs", RandomTreesEmbedding()), ("estimator", RandomForestClassifier())] + ) + pipe.fit([[1]], [1]) + pipe.predict([[1]]) + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize("last_step", [None, "passthrough"]) +def test_pipeline_with_no_last_step(last_step): + """Test that the pipeline works when there is not last step. + + It should just ignore and pass through the data on transform. + """ + pipe = Pipeline([("trs", FunctionTransformer()), ("estimator", last_step)]) + assert pipe.fit([[1]], [1]).transform([[1], [2], [3]]) == [[1], [2], [3]] + + # End of routing tests # ==================== diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 7482fd6022e50..26267a1355f6f 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -193,7 +193,7 @@ def _support_missing_values(self, X): and self.monotonic_cst is None ) - def _compute_missing_values_in_feature_mask(self, X): + def _compute_missing_values_in_feature_mask(self, X, estimator_name=None): """Return boolean mask denoting if there are missing values for each feature. This method also ensures that X is finite. @@ -203,13 +203,17 @@ def _compute_missing_values_in_feature_mask(self, X): X : array-like of shape (n_samples, n_features), dtype=DOUBLE Input data. + estimator_name : str or None, default=None + Name to use when raising an error. Defaults to the class name. + Returns ------- missing_values_in_feature_mask : ndarray of shape (n_features,), or None Missing value mask. If missing values are not supported or there are no missing values, return None. 
""" - common_kwargs = dict(estimator_name=self.__class__.__name__, input_name="X") + estimator_name = estimator_name or self.__class__.__name__ + common_kwargs = dict(estimator_name=estimator_name, input_name="X") if not self._support_missing_values(X): assert_all_finite(X, **common_kwargs) diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 4a23f4d2da946..b43ce1712709d 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -85,7 +85,7 @@ def __repr__(self): "decision_tree": [DecisionTreeClassifier, DecisionTreeRegressor], "max_depth": [Interval(Integral, 0, None, closed="left"), None], "feature_names": [list, None], - "class_names": [list, None], + "class_names": ["array-like", "boolean", None], "label": [StrOptions({"all", "root", "none"})], "filled": ["boolean"], "impurity": ["boolean"], @@ -140,7 +140,7 @@ def plot_tree( Names of each of the features. If None, generic names will be used ("x[0]", "x[1]", ...). - class_names : list of str or bool, default=None + class_names : array-like of str or True, default=None Names of each of the target classes in ascending numerical order. Only relevant for classification and not supported for multi-output. If ``True``, shows a symbolic representation of the class name. diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 87173032a3bd3..f8b4d2042223c 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -188,7 +188,7 @@ def _array_indexing(array, key, key_dtype, axis): key = np.asarray(key) if isinstance(key, tuple): key = list(key) - return array[key] if axis == 0 else array[:, key] + return array[key, ...] if axis == 0 else array[:, key] def _pandas_indexing(X, key, key_dtype, axis): diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index ca0d9fcaf1509..ed16ce767a0cd 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -186,6 +186,9 @@ def __init__(self, array_namespace): def __getattr__(self, name): return getattr(self._namespace, name) + def __eq__(self, other): + return self._namespace == other._namespace + def take(self, X, indices, *, axis=0): # When array_api supports `take` we can use this directly # https://github.com/data-apis/array-api/issues/177 diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 5affa4616be01..fb3912b27dbfe 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -177,7 +177,7 @@ def _unique_python(values, *, return_inverse, return_counts): except TypeError: types = sorted(t.__qualname__ for t in set(type(v) for v in values)) raise TypeError( - "Encoders require their input to be uniformly " + "Encoders require their input argument must be uniformly " f"strings or numbers. 
Got {types}" ) ret = (uniques,) diff --git a/sklearn/utils/_estimator_html_repr.py b/sklearn/utils/_estimator_html_repr.py index e9b95666cdd32..207096823cae6 100644 --- a/sklearn/utils/_estimator_html_repr.py +++ b/sklearn/utils/_estimator_html_repr.py @@ -190,13 +190,35 @@ def _write_estimator_html( _STYLE = """ #$id { - color: black; + --sklearn-color-text: black; + --sklearn-color-line: gray; + --sklearn-color-background: white; + --sklearn-color-background-box: #f0f8ff; + --sklearn-color-border-box: black; + --sklearn-color-icon: #696969; + --sklearn-color-active: #d4ebff; + --sklearn-color-highlight: #d4ebff; + + @media (prefers-color-scheme: dark) { + --sklearn-color-text: white; + --sklearn-color-line: gray; + --sklearn-color-background: #111; + --sklearn-color-background-box: #424242; + --sklearn-color-border-box: white; + --sklearn-color-icon: #878787; + --sklearn-color-active: #616161; + --sklearn-color-highlight: #616161; + } +} + +#$id { + color: var(--sklearn-color-text); } #$id pre{ padding: 0; } #$id div.sk-toggleable { - background-color: white; + background-color: var(--sklearn-color-background); } #$id label.sk-toggleable__label { cursor: pointer; @@ -211,26 +233,26 @@ def _write_estimator_html( content: "▸"; float: left; margin-right: 0.25em; - color: #696969; + color: var(--sklearn-color-icon); } #$id label.sk-toggleable__label-arrow:hover:before { - color: black; + color: var(--sklearn-color-text); } #$id div.sk-estimator:hover label.sk-toggleable__label-arrow:before { - color: black; + color: var(--sklearn-color-text); } #$id div.sk-toggleable__content { max-height: 0; max-width: 0; overflow: hidden; text-align: left; - background-color: #f0f8ff; + background-color: var(--sklearn-color-background-box); } #$id div.sk-toggleable__content pre { margin: 0.2em; - color: black; + color: var(--sklearn-color-text); border-radius: 0.25em; - background-color: #f0f8ff; + background-color: var(--sklearn-color-background-box); } #$id input.sk-toggleable__control:checked~div.sk-toggleable__content { max-height: 200px; @@ -241,10 +263,10 @@ def _write_estimator_html( content: "▾"; } #$id div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label { - background-color: #d4ebff; + background-color: var(--sklearn-color-active); } #$id div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label { - background-color: #d4ebff; + background-color: var(--sklearn-color-active); } #$id input.sk-hidden--visually { border: 0; @@ -259,28 +281,28 @@ def _write_estimator_html( } #$id div.sk-estimator { font-family: monospace; - background-color: #f0f8ff; - border: 1px dotted black; + background-color: var(--sklearn-color-background-box); + border: 1px dotted var(--sklearn-color-border-box); border-radius: 0.25em; box-sizing: border-box; margin-bottom: 0.5em; } #$id div.sk-estimator:hover { - background-color: #d4ebff; + background-color: var(--sklearn-color-highlight); } #$id div.sk-parallel-item::after { content: ""; width: 100%; - border-bottom: 1px solid gray; + border-bottom: 1px solid var(--sklearn-color-line); flex-grow: 1; } #$id div.sk-label:hover label.sk-toggleable__label { - background-color: #d4ebff; + background-color: var(--sklearn-color-highlight); } #$id div.sk-serial::before { content: ""; position: absolute; - border-left: 1px solid gray; + border-left: 1px solid var(--sklearn-color-line); box-sizing: border-box; top: 0; bottom: 0; @@ -291,7 +313,7 @@ def _write_estimator_html( display: flex; flex-direction: column; align-items: center; - 
background-color: white; + background-color: var(--sklearn-color-background); padding-right: 0.2em; padding-left: 0.2em; position: relative; @@ -304,13 +326,13 @@ def _write_estimator_html( display: flex; align-items: stretch; justify-content: center; - background-color: white; + background-color: var(--sklearn-color-background); position: relative; } #$id div.sk-item::before, #$id div.sk-parallel-item::before { content: ""; position: absolute; - border-left: 1px solid gray; + border-left: 1px solid var(--sklearn-color-line); box-sizing: border-box; top: 0; bottom: 0; @@ -322,7 +344,7 @@ def _write_estimator_html( flex-direction: column; z-index: 1; position: relative; - background-color: white; + background-color: var(--sklearn-color-background); } #$id div.sk-parallel-item:first-child::after { align-self: flex-end; @@ -336,11 +358,11 @@ def _write_estimator_html( width: 0; } #$id div.sk-dashed-wrapped { - border: 1px dashed gray; + border: 1px dashed var(--sklearn-color-line); margin: 0 0.4em 0.5em 0.4em; box-sizing: border-box; padding-bottom: 0.4em; - background-color: white; + background-color: var(--sklearn-color-background); } #$id div.sk-label label { font-family: monospace; diff --git a/sklearn/utils/_metadata_requests.py b/sklearn/utils/_metadata_requests.py index 17d8e37510e48..1a9c07438b17a 100644 --- a/sklearn/utils/_metadata_requests.py +++ b/sklearn/utils/_metadata_requests.py @@ -80,7 +80,7 @@ import inspect from collections import namedtuple from copy import deepcopy -from typing import Optional, Union +from typing import TYPE_CHECKING, Optional, Union from warnings import warn from .. import get_config @@ -89,6 +89,9 @@ # Only the following methods are supported in the routing mechanism. Adding new # methods at the moment involves monkeypatching this list. +# Note that if this list is changed or monkeypatched, the corresponding method +# needs to be added under a TYPE_CHECKING condition like the one done here in +# _MetadataRequester SIMPLE_METHODS = [ "fit", "partial_fit", @@ -1251,6 +1254,27 @@ class _MetadataRequester: .. versionadded:: 1.3 """ + if TYPE_CHECKING: # pragma: no cover + # This code is never run in runtime, but it's here for type checking. + # Type checkers fail to understand that the `set_{method}_request` + # methods are dynamically generated, and they complain that they are + # not defined. We define them here to make type checkers happy. + # During type checking analyzers assume this to be True. + # The following list of defined methods mirrors the list of methods + # in SIMPLE_METHODS. + # fmt: off + def set_fit_request(self, **kwargs): pass + def set_partial_fit_request(self, **kwargs): pass + def set_predict_request(self, **kwargs): pass + def set_predict_proba_request(self, **kwargs): pass + def set_predict_log_proba_request(self, **kwargs): pass + def set_decision_function_request(self, **kwargs): pass + def set_score_request(self, **kwargs): pass + def set_split_request(self, **kwargs): pass + def set_transform_request(self, **kwargs): pass + def set_inverse_transform_request(self, **kwargs): pass + # fmt: on + def __init_subclass__(cls, **kwargs): """Set the ``set_{method}_request`` methods. @@ -1412,7 +1436,11 @@ def get_metadata_routing(self): # given metadata. This is to minimize the boilerplate required in routers. -def process_routing(obj, method, other_params, **kwargs): +# Here the first two arguments are positional only which makes everything +# passed as keyword argument a metadata. 
The first two args also have an `_` +# prefix to reduce the chances of name collisions with the passed metadata, and +# since they're positional only, users will never type those underscores. +def process_routing(_obj, _method, /, **kwargs): """Validate and route input parameters. This function is used inside a router's method, e.g. :term:`fit`, @@ -1420,26 +1448,21 @@ def process_routing(obj, method, other_params, **kwargs): Assuming this signature: ``fit(self, X, y, sample_weight=None, **fit_params)``, a call to this function would be: - ``process_routing(self, fit_params, sample_weight=sample_weight)``. + ``process_routing(self, sample_weight=sample_weight, **fit_params)``. .. versionadded:: 1.3 Parameters ---------- - obj : object + _obj : object An object implementing ``get_metadata_routing``. Typically a meta-estimator. - method : str + _method : str The name of the router's method in which this function is called. - other_params : dict - A dictionary of extra parameters passed to the router's method, - e.g. ``**fit_params`` passed to a meta-estimator's :term:`fit`. - **kwargs : dict - Parameters explicitly accepted and included in the router's method - signature. + Metadata to be routed. Returns ------- @@ -1449,27 +1472,20 @@ def process_routing(obj, method, other_params, **kwargs): corresponding methods or corresponding child objects. The object names are those defined in `obj.get_metadata_routing()`. """ - if not hasattr(obj, "get_metadata_routing"): + if not (hasattr(_obj, "get_metadata_routing") or isinstance(_obj, MetadataRouter)): raise AttributeError( - f"This {repr(obj.__class__.__name__)} has not implemented the routing" - " method `get_metadata_routing`." + f"The given object ({repr(_obj.__class__.__name__)}) needs to either" + " implement the routing method `get_metadata_routing` or be a" + " `MetadataRouter` instance." ) - if method not in METHODS: + if _method not in METHODS: raise TypeError( f"Can only route and process input on these methods: {METHODS}, " - f"while the passed method is: {method}." + f"while the passed method is: {_method}." ) - # We take the extra params (**fit_params) which is passed as `other_params` - # and add the explicitly passed parameters (passed as **kwargs) to it. This - # is equivalent to a code such as this in a router: - # if sample_weight is not None: - # fit_params["sample_weight"] = sample_weight - all_params = other_params if other_params is not None else dict() - all_params.update(kwargs) - - request_routing = get_routing_for_object(obj) - request_routing.validate_metadata(params=all_params, method=method) - routed_params = request_routing.route_params(params=all_params, caller=method) + request_routing = get_routing_for_object(_obj) + request_routing.validate_metadata(params=kwargs, method=_method) + routed_params = request_routing.route_params(params=kwargs, caller=_method) return routed_params diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py index 9eee7c370e341..bb289535c45ec 100644 --- a/sklearn/utils/_set_output.py +++ b/sklearn/utils/_set_output.py @@ -5,6 +5,7 @@ from .._config import get_config from . 
import check_pandas_support from ._available_if import available_if +from .validation import _is_pandas_df def _wrap_in_pandas_container( @@ -125,9 +126,10 @@ def _wrap_data_with_container(method, data_to_wrap, original_input, estimator): return data_to_wrap # dense_config == "pandas" + index = original_input.index if _is_pandas_df(original_input) else None return _wrap_in_pandas_container( data_to_wrap=data_to_wrap, - index=getattr(original_input, "index", None), + index=index, columns=estimator.get_feature_names_out, ) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 8b54df9f25b72..bf558ff7e6dd4 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -38,6 +38,7 @@ assert_array_almost_equal, assert_array_equal, assert_array_less, + assert_no_warnings, ) import sklearn @@ -65,6 +66,7 @@ "assert_approx_equal", "assert_allclose", "assert_run_python_script", + "assert_no_warnings", "SkipTest", ] @@ -80,32 +82,6 @@ assert_raises_regexp = assert_raises_regex -# To remove when we support numpy 1.7 -def assert_no_warnings(func, *args, **kw): - """ - Parameters - ---------- - func - *args - **kw - """ - # very important to avoid uncontrolled state propagation - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - result = func(*args, **kw) - if hasattr(np, "FutureWarning"): - # Filter out numpy-specific warnings in numpy >= 1.9 - w = [e for e in w if e.category is not np.VisibleDeprecationWarning] - - if len(w) > 0: - raise AssertionError( - "Got warnings when calling %s: [%s]" - % (func.__name__, ", ".join(str(warning) for warning in w)) - ) - return result - - def ignore_warnings(obj=None, category=Warning): """Context manager and decorator to ignore warnings. diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e77197e24a69e..53ae056b4d2f7 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1350,7 +1350,10 @@ def check_dtype_object(name, estimator_orig): if "string" not in tags["X_types"]: X[0, 0] = {"foo": "bar"} - msg = "argument must be a string.* number" + # This error is raised by: + # - `np.asarray` in `check_array` + # - `_unique_python` for encoders + msg = "argument must be .* string.* number" with raises(TypeError, match=msg): estimator.fit(X, y) else: @@ -3542,7 +3545,6 @@ def _enforce_estimator_tags_y(estimator, y): # Create strictly positive y. The minimal increment above 0 is 1, as # y could be of integer dtype. 
y += 1 + abs(y.min()) - # Estimators with a `binary_only` tag only accept up to two unique y values if _safe_tags(estimator, key="binary_only") and y.size > 0: y = np.where(y == y.flat[0], y, y.flat[0] + 1) # Estimators in mono_output_task_error raise ValueError if y is of 1-D @@ -3562,7 +3564,8 @@ def _enforce_estimator_tags_X(estimator, X, kernel=linear_kernel): if _safe_tags(estimator, key="requires_positive_X"): X = X - X.min() if "categorical" in _safe_tags(estimator, key="X_types"): - X = (X - X.min()).astype(np.int32) + dtype = np.float64 if _safe_tags(estimator, key="allow_nan") else np.int32 + X = np.round((X - X.min())).astype(dtype) if estimator.__class__.__name__ == "SkewedChi2Sampler": # SkewedChi2Sampler requires X > -skewdness in transform @@ -4584,7 +4587,7 @@ def check_set_output_transform_pandas(name, transformer_orig): outputs_pandas = _output_from_fit_transform(transformer_pandas, name, X, df, y) except ValueError as e: # transformer does not support sparse data - assert str(e) == "Pandas output does not support sparse data.", e + assert "Pandas output does not support sparse data." in str(e), e return for case in outputs_default: @@ -4630,7 +4633,7 @@ def check_global_output_transform_pandas(name, transformer_orig): ) except ValueError as e: # transformer does not support sparse data - assert str(e) == "Pandas output does not support sparse data.", e + assert "Pandas output does not support sparse data." in str(e), e return for case in outputs_default: diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 2202a1daaf90a..d33b638358157 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -158,3 +158,11 @@ def _contents(data_module): ) else: return resources.contents(data_module) + + +# For +1.25 NumPy versions exceptions and warnings are being moved +# to a dedicated submodule. 
+if np_version >= parse_version("1.25.0"): + from numpy.exceptions import VisibleDeprecationWarning +else: + from numpy import VisibleDeprecationWarning # type: ignore # noqa diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 892d77c7e01e5..1f46f6400df98 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -14,6 +14,7 @@ from scipy.sparse import issparse from ..utils._array_api import get_namespace +from ..utils.fixes import VisibleDeprecationWarning from .validation import _assert_all_finite, check_array @@ -161,10 +162,10 @@ def is_multilabel(y): ensure_min_features=0, ) with warnings.catch_warnings(): - warnings.simplefilter("error", np.VisibleDeprecationWarning) + warnings.simplefilter("error", VisibleDeprecationWarning) try: y = check_array(y, dtype=None, **check_y_kwargs) - except (np.VisibleDeprecationWarning, ValueError) as e: + except (VisibleDeprecationWarning, ValueError) as e: if str(e).startswith("Complex data not supported"): raise @@ -324,11 +325,11 @@ def type_of_target(y, input_name=""): ) with warnings.catch_warnings(): - warnings.simplefilter("error", np.VisibleDeprecationWarning) + warnings.simplefilter("error", VisibleDeprecationWarning) if not issparse(y): try: y = check_array(y, dtype=None, **check_y_kwargs) - except (np.VisibleDeprecationWarning, ValueError) as e: + except (VisibleDeprecationWarning, ValueError) as e: if str(e).startswith("Complex data not supported"): raise diff --git a/sklearn/utils/tests/test_estimator_html_repr.py b/sklearn/utils/tests/test_estimator_html_repr.py index e4327dcbc2c46..bbe44ac8974fa 100644 --- a/sklearn/utils/tests/test_estimator_html_repr.py +++ b/sklearn/utils/tests/test_estimator_html_repr.py @@ -197,6 +197,9 @@ def test_estimator_html_repr_pipeline(): assert f"" in html_output assert f"
<pre>{html.escape(str(est))}</pre>
" in html_output + # verify that prefers-color-scheme is implemented + assert "prefers-color-scheme" in html_output + @pytest.mark.parametrize("final_estimator", [None, LinearSVC()]) def test_stacking_classifier(final_estimator): diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py index a4aaa8f21b6b7..ec48c4a012574 100644 --- a/sklearn/utils/tests/test_pprint.py +++ b/sklearn/utils/tests/test_pprint.py @@ -12,7 +12,7 @@ # Ignore flake8 (lots of line too long issues) -# flake8: noqa +# ruff: noqa # Constructors excerpted to test pprinting diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py index 403a5db63ec54..d1722a1553f9c 100644 --- a/sklearn/utils/tests/test_set_output.py +++ b/sklearn/utils/tests/test_set_output.py @@ -315,3 +315,32 @@ def test_set_output_named_tuple_out(): assert isinstance(X_trans, Output) assert_array_equal(X_trans.X, X) assert_array_equal(X_trans.Y, 2 * X) + + +class EstimatorWithListInput(_SetOutputMixin): + def fit(self, X, y=None): + assert isinstance(X, list) + self.n_features_in_ = len(X[0]) + return self + + def transform(self, X, y=None): + return X + + def get_feature_names_out(self, input_features=None): + return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object) + + +def test_set_output_list_input(): + """Check set_output for list input. + + Non-regression test for #27037. + """ + pd = pytest.importorskip("pandas") + + X = [[0, 1, 2, 3], [4, 5, 6, 7]] + est = EstimatorWithListInput() + est.set_output(transform="pandas") + + X_out = est.fit(X).transform(X) + assert isinstance(X_out, pd.DataFrame) + assert_array_equal(X_out.columns, ["X0", "X1", "X2", "X3"])