From 0ac8a71ee2ad670d7b3a39201f63e1b48938204c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 28 Mar 2023 19:58:02 -0700 Subject: [PATCH 01/39] [EXAMPLE DIFF] (Tree featuresv2) Fork of sklearn that maintains all necessary refactorings to enable downstream functionality (#32) #### Reference Issues/PRs This is the most up-to-date PR branch to consolidate all proposed refactor changes that work with: - unsupervised trees - oblique trees - no performance/runtime regressions against main #### What does this implement/fix? Explain your changes. Incorporates refactors to: Internal Cython of scikit-learn's: - criterion - splitter - tree Internals of Python in scikit-learns: - python Tree Adds the basic implementation of oblique trees. The implementation of oblique trees has been tested on all sklearn's `check_estimator` testing function and has error-checking bounds for the new hyperparameter introduced, which is `feature_combinations` that defaults to ``min(1.5, n_features)``. TODO: 1. [ ] ~Add honest support for trees (splitting the data at the Python API level)~ 2. [x] Build wheels 3. [ ] ~Brainstorm unit-tests, or weekly checks to determine when our fork is out-of-date compared to upstream sklearn~ 4. [x] Revamp README for the fork #### Any other comments? [cd build] --------- Signed-off-by: Adam Li Co-authored-by: Chester Huynh Co-authored-by: Parth Vora --- .circleci/config.yml | 33 +- .cirrus.star | 4 +- .github/workflows/check-changelog.yml | 3 +- .github/workflows/check-manifest.yml | 2 +- .github/workflows/labeler-module.yml | 4 +- .github/workflows/update_tracking_issue.yml | 2 +- .github/workflows/wheels.yml | 33 +- .gitignore | 1 + Makefile | 3 + README.rst | 322 ++++++---- build_tools/azure/install.sh | 2 +- build_tools/azure/install_win.sh | 2 +- doc/Makefile | 2 + doc/conf.py | 3 +- doc/modules/tree.rst | 61 +- examples/tree/plot_iris_dtc.py | 4 - setup.py | 45 +- sklearn/ensemble/_forest.py | 108 +++- sklearn/ensemble/tests/test_forest.py | 171 +++++ sklearn/tree/_classes.py | 162 +++-- sklearn/tree/_criterion.pxd | 45 +- sklearn/tree/_criterion.pyx | 285 ++++----- sklearn/tree/_splitter.pxd | 41 +- sklearn/tree/_splitter.pyx | 165 +++-- sklearn/tree/_tree.pxd | 90 ++- sklearn/tree/_tree.pyx | 659 ++++++++++++-------- sklearn/tree/tests/test_tree.py | 32 +- 27 files changed, 1499 insertions(+), 785 deletions(-) mode change 100755 => 100644 setup.py diff --git a/.circleci/config.yml b/.circleci/config.yml index e2f54c0665c78..e4e66b5c57f49 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -94,22 +94,23 @@ jobs: root: doc/_build/html paths: . - deploy: - docker: - - image: cimg/python:3.8.12 - steps: - - checkout - - run: ./build_tools/circle/checkout_merge_commit.sh - # Attach documentation generated in the 'doc' step so that it can be - # deployed. - - attach_workspace: - at: doc/_build/html - - run: ls -ltrh doc/_build/html/stable - - deploy: - command: | - if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then - bash build_tools/circle/push_doc.sh doc/_build/html/stable - fi + # XXX: in order to make sure our fork passes all the CIs and not remove too many LOC, we don't want to deploy + # deploy: + # docker: + # - image: cimg/python:3.8.12 + # steps: + # - checkout + # - run: ./build_tools/circle/checkout_merge_commit.sh + # # Attach documentation generated in the 'doc' step so that it can be + # # deployed. 
+ # - attach_workspace: + # at: doc/_build/html + # - run: ls -ltrh doc/_build/html/stable + # - deploy: + # command: | + # if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then + # bash build_tools/circle/push_doc.sh doc/_build/html/stable + # fi workflows: version: 2 diff --git a/.cirrus.star b/.cirrus.star index 8b3de0d10c532..2dd1e50144987 100644 --- a/.cirrus.star +++ b/.cirrus.star @@ -4,9 +4,9 @@ load("cirrus", "env", "fs", "http") def main(ctx): - # Only run for scikit-learn/scikit-learn. For debugging on a fork, you can + # Only run for neurodata/scikit-learn. For debugging on a fork, you can # comment out the following condition. - if env.get("CIRRUS_REPO_FULL_NAME") != "scikit-learn/scikit-learn": + if env.get("CIRRUS_REPO_FULL_NAME") != "neurodata/scikit-learn": return [] arm_wheel_yaml = "build_tools/cirrus/arm_wheel.yml" diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index d5bfc8ef0f430..53f64ba5c886b 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -10,12 +10,13 @@ jobs: check: name: A reviewer will let you know if it is required or can be bypassed runs-on: ubuntu-latest - if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} + if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 && github.repository == 'scikit-learn/scikit-learn' }} steps: - name: Get PR number and milestone run: | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV + echo "${{ github.repository }}" - uses: actions/checkout@v3 with: fetch-depth: '0' diff --git a/.github/workflows/check-manifest.yml b/.github/workflows/check-manifest.yml index 004cc452e385e..5ef9ce2213e90 100644 --- a/.github/workflows/check-manifest.yml +++ b/.github/workflows/check-manifest.yml @@ -7,7 +7,7 @@ on: jobs: check-manifest: # Don't run on forks - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/labeler-module.yml b/.github/workflows/labeler-module.yml index 061d0094b38c5..8092711f07e45 100644 --- a/.github/workflows/labeler-module.yml +++ b/.github/workflows/labeler-module.yml @@ -16,7 +16,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.0 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: "3" @@ -27,7 +27,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.0 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" configuration-path: ".github/labeler-file-extensions.yml" diff --git a/.github/workflows/update_tracking_issue.yml b/.github/workflows/update_tracking_issue.yml index 124ea1e8c6ac4..c176ce356a4cf 100644 --- a/.github/workflows/update_tracking_issue.yml +++ b/.github/workflows/update_tracking_issue.yml @@ -24,7 +24,7 @@ on: jobs: update_tracking_issue: runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' && github.event_name == 'schedule' + if: github.repository == 'neurodata/scikit-learn' && github.event_name == 'schedule' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 diff --git a/.github/workflows/wheels.yml 
b/.github/workflows/wheels.yml index b43f29ffa4f7f..4ab75fd361586 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -7,12 +7,12 @@ on: - cron: "42 3 */1 * *" push: branches: - - main + - fork # Release branches - "[0-9]+.[0-9]+.X" pull_request: branches: - - main + - fork - "[0-9]+.[0-9]+.X" # Manual run workflow_dispatch: @@ -26,7 +26,7 @@ jobs: check_build_trigger: name: Check build trigger runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' outputs: build: ${{ steps.check_build_trigger.outputs.build }} @@ -178,31 +178,8 @@ jobs: with: path: dist/*.tar.gz - # Upload the wheels and the source distribution - upload_anaconda: - name: Upload to Anaconda - runs-on: ubuntu-latest - needs: [build_wheels, build_sdist] - # The artifacts cannot be uploaded on PRs - if: github.event_name != 'pull_request' - - steps: - - name: Checkout scikit-learn - uses: actions/checkout@v3 - - - name: Download artifacts - uses: actions/download-artifact@v3 + - uses: actions/upload-artifact@v3 with: path: dist + name: ${{ matrix.python[0] }}-${{ matrix.os[1] }} - - name: Setup Python - uses: actions/setup-python@v4 - - - name: Upload artifacts - env: - # Secret variables need to be mapped to environment variables explicitly - SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN }} - SCIKIT_LEARN_STAGING_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_STAGING_UPLOAD_TOKEN }} - ARTIFACTS_PATH: dist/artifact - # Force a replacement if the remote file already exists - run: bash build_tools/github/upload_anaconda.sh diff --git a/.gitignore b/.gitignore index 89600846100a8..1e28896f50be6 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ build sklearn/datasets/__config__.py sklearn/**/*.html +scikit_learn_tree.egg-info/* dist/ MANIFEST diff --git a/Makefile b/Makefile index 5ea64dc0d6cac..148027b30f59f 100644 --- a/Makefile +++ b/Makefile @@ -63,3 +63,6 @@ doc-noplot: inplace code-analysis: flake8 sklearn | grep -v __init__ | grep -v external pylint -E -i y sklearn/ -d E1103,E0611,E1101 + +build-dev: + pip install --verbose --no-build-isolation --editable . diff --git a/README.rst b/README.rst index 5e2de6a6d8b46..fbdfdaa95ef4c 100644 --- a/README.rst +++ b/README.rst @@ -44,20 +44,36 @@ .. |PytestMinVersion| replace:: 5.3.1 .. |PlotlyMinVersion| replace:: 5.10.0 -.. image:: https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/doc/logos/scikit-learn-logo.png - :target: https://scikit-learn.org/ +``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line +with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is +released under the name ``scikit-learn-tree`` to avoid confusion. -**scikit-learn** is a Python module for machine learning built on top of -SciPy and is distributed under the 3-Clause BSD license. +It is currently maintained by a team of volunteers. -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. +The upstream package **scikit-learn** is a Python module for machine learning built on top of +SciPy and is distributed under the 3-Clause BSD license. Refer to their website for all documentation +needs: https://scikit-learn.org. -It is currently maintained by a team of volunteers. +Why a fork? 
+-----------
+Currently, the scikit-learn tree submodule is difficult to extend. Requests to modularize
+and improve the extensibility of the code are currently unsupported, or may take a long time.
+Advanced tree models that also leverage the robustness of scikit-learn are nonetheless desirable.
+
+However, "hard-forking" via copy/pasting the explicit Python/Cython code into another tree package
+altogether is undesirable because it results in a tree codebase that is inherently different
+and not compatible with ``scikit-learn``. For example, `quantile-forests `_,
+and `EconML `_ do this, and their current tree submodules
+cannot take advantage of improvements made in upstream ``scikit-learn``.
+
+An example of seamless integration would be `scikit-survival `_, which
+only needs to implement a subclass of the Cython ``Criterion`` object in their code to enable survival trees.
-Website: https://scikit-learn.org
+Maintaining a "soft-fork" of ``scikit-learn`` in the form of a repository fork allows us to develop
+a separate package that serves as a stand-in for ``sklearn`` in any package, extends the tree submodule
+and can also be synced with upstream changes in ``scikit-learn``. This enables the fork to always
+take advantage of improvements made in ``scikit-learn`` main upstream, while providing a customizable
+tree API.

 Installation
 ------------
@@ -73,133 +89,195 @@ scikit-learn requires:
 - joblib (>= |JoblibMinVersion|)
 - threadpoolctl (>= |ThreadpoolctlMinVersion|)
-=======
+============================
+Installing scikit-learn-tree
+============================
-**Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4.**
-scikit-learn 1.0 and later require Python 3.7 or newer.
-scikit-learn 1.1 and later require Python 3.8 or newer.
+Scikit-learn-tree is a maintained fork of scikit-learn, which extends the
+tree submodule in a few ways documented in :ref:`changelog of the fork
+`.
-Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and
-classes end with "Display") require Matplotlib (>= |MatplotlibMinVersion|).
-For running the examples Matplotlib >= |MatplotlibMinVersion| is required.
-A few examples require scikit-image >= |Scikit-ImageMinVersion|, a few examples
-require pandas >= |PandasMinVersion|, some examples require seaborn >=
-|SeabornMinVersion| and plotly >= |PlotlyMinVersion|.
+We release versions of scikit-learn-tree in an analogous fashion to
+scikit-learn main. Due to limited maintenance resources, we only release on PyPI
+and therefore recommend installing with ``pip``.
-User installation
-~~~~~~~~~~~~~~~~~
+There are different ways to install scikit-learn-tree:
-If you already have a working installation of numpy and scipy,
-the easiest way to install scikit-learn is using ``pip``::
+  * :ref:`Install the latest official release `. This
+    is the best approach for most users. It will provide a stable version
+    and pre-built packages are available for most platforms.
+
+  * :ref:`Building the package from source
+    `. This is best for users who want the
+    latest-and-greatest features and aren't afraid of running
+    brand-new code. This is also needed for users who wish to contribute to the
+    project.
-    pip install -U scikit-learn
+.. _install_fork_release:
-or ``conda``::
+Installing the latest release
+-----------------------------
+We release wheels for common distributions, so the latest release is installable via ``pip``.
-    conda install -c conda-forge scikit-learn
+..
prompt:: bash $ + + pip install scikit-learn-tree -The documentation includes more detailed `installation instructions `_. +This will install ``scikit-learn-tree`` under the namespace of ``sklearn``, which then +can be used as a stand-in for any package that relies on the public API of ``sklearn``. +For example, any usage of ``scikit-learn`` is preserved with ``scikit-learn-tree`` -Changelog ---------- + >>> # the sklearn installed is that of scikit-learn-tree and is equivalent to scikit-learn + >>> from sklearn.ensemble import RandomForestClassifier + >>> clf = RandomForestClassifier(random_state=0) + >>> X = [[ 1, 2, 3], # 2 samples, 3 features + ... [11, 12, 13]] + >>> y = [0, 1] # classes of each sample + >>> clf.fit(X, y) + RandomForestClassifier(random_state=0) -See the `changelog `__ -for a history of notable changes to scikit-learn. +.. _install_source: + +Building from source +-------------------- +If you are a developer and are interested in helping maintain, or add some new +features to the fork, the building from source instructions are exactly the same +as that of scikit-learn main, so please refer to `scikit-learn documentation `_ +for instructions on building from source. Development ------------ +=========== -We welcome new contributors of all experience levels. The scikit-learn -community goals are to be helpful, welcoming, and effective. The +We welcome new contributors of all experience levels, specifically to maintain the fork. +Any contributions that make sure our fork is "better in-line" with scikit-learn upstream, +or improves the tree submodule in anyway will be appreciated. + +The scikit-learn community goals are to be helpful, welcoming, and effective. The `Development Guide `_ has detailed information about contributing code, documentation, tests, and more. We've included some basic information in this README. -Important links -~~~~~~~~~~~~~~~ - -- Official source code repo: https://github.com/scikit-learn/scikit-learn -- Download releases: https://pypi.org/project/scikit-learn/ -- Issue tracker: https://github.com/scikit-learn/scikit-learn/issues - -Source code -~~~~~~~~~~~ - -You can check the latest sources with the command:: - - git clone https://github.com/scikit-learn/scikit-learn.git - -Contributing -~~~~~~~~~~~~ - -To learn more about making a contribution to scikit-learn, please see our -`Contributing guide -`_. - -Testing -~~~~~~~ - -After installation, you can launch the test suite from outside the source -directory (you will need to have ``pytest`` >= |PyTestMinVersion| installed):: - - pytest sklearn - -See the web page https://scikit-learn.org/dev/developers/contributing.html#testing-and-improving-test-coverage -for more information. - - Random number generation can be controlled during testing by setting - the ``SKLEARN_SEED`` environment variable. - -Submitting a Pull Request -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Before opening a Pull Request, have a look at the -full Contributing page to make sure your code complies -with our guidelines: https://scikit-learn.org/stable/developers/index.html - -Project History ---------------- - -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. - -The project is currently maintained by a team of volunteers. - -**Note**: `scikit-learn` was previously referred to as `scikits.learn`. 
-
-Help and Support
-----------------
-
-Documentation
-~~~~~~~~~~~~~
-
-- HTML documentation (stable release): https://scikit-learn.org
-- HTML documentation (development version): https://scikit-learn.org/dev/
-- FAQ: https://scikit-learn.org/stable/faq.html
-
-Communication
-~~~~~~~~~~~~~
-
-- Mailing list: https://mail.python.org/mailman/listinfo/scikit-learn
-- Gitter: https://gitter.im/scikit-learn/scikit-learn
-- Logos & Branding: https://github.com/scikit-learn/scikit-learn/tree/main/doc/logos
-- Blog: https://blog.scikit-learn.org
-- Calendar: https://blog.scikit-learn.org/calendar/
-- Twitter: https://twitter.com/scikit_learn
-- Twitter (commits): https://twitter.com/sklearn_commits
-- Stack Overflow: https://stackoverflow.com/questions/tagged/scikit-learn
-- Github Discussions: https://github.com/scikit-learn/scikit-learn/discussions
-- Website: https://scikit-learn.org
-- LinkedIn: https://www.linkedin.com/company/scikit-learn
-- YouTube: https://www.youtube.com/channel/UCJosFjYm0ZYVUARxuOZqnnw/playlists
-- Facebook: https://www.facebook.com/scikitlearnofficial/
-- Instagram: https://www.instagram.com/scikitlearnofficial/
-- TikTok: https://www.tiktok.com/@scikit.learn
-
-Citation
-~~~~~~~~
-
-If you use scikit-learn in a scientific publication, we would appreciate citations: https://scikit-learn.org/stable/about.html#citing-scikit-learn
+.. _fork-changelog:
+Major Changes of the Fork
+=========================
+
+The purpose of this page is to illustrate some of the main features that
+``scikit-learn-tree`` provides compared to ``scikit-learn``. It assumes
+an understanding of the core package ``scikit-learn`` and also of decision tree
+models. Please refer to our :ref:`installation instructions
+` for installing ``scikit-learn-tree``.
+
+Scikit-learn-tree otherwise operates as a stand-in for upstream ``scikit-learn``.
+It is used in packages exactly the same way and will support all features
+in the corresponding version of ``scikit-learn``. For example, if you
+are interested in features of ``scikit-learn`` v1.2.2 for the ``NearestNeighbors`` algorithm,
+then as long as ``scikit-learn-tree`` has a v1.2.2 release, it will have
+all those features.
+
+The breaking API changes will be with respect to anything in the ``tree`` submodule,
+and the related Forest ensemble models. See below for a detailed list of breaking changes.
+
+See: https://scikit-learn.org/ for documentation on scikit-learn main.
+
+Our Philosophy
+--------------
+Our design philosophy with this fork of ``scikit-learn`` is to maintain as few changes
+as possible, such that incorporating upstream changes into the fork requires minimal effort.
+
+Candidate changes and PRs accepted into the fork are those that:
+
+- improve compatibility with upstream ``scikit-learn`` main
+- enable improved extensibility of tree models
+
+Decision tree generalizations
+-----------------------------
+
+``Scikit-learn`` provides an axis-aligned :class:`~sklearn.tree.DecisionTreeClassifier`
+decision tree model (classifier and regressor), which has a few fundamental limitations
+that prevent 3rd parties from utilizing the existing class without forking a large
+amount of copy/pasted Python and Cython code. We highlight those limitations here
+and then describe how we generalize each of them.
+
+Cython Internal Private API:
+
+Note, the Cython API for scikit-learn is still not a publicly supported API, so it may
+change without warning.
+
+- leaf and split nodes: These nodes are treated the same way and there is no internal
+  API for setting them differently. Quantile trees and causal trees inherently generalize
+  how leaf nodes are set.
+- Criterion class: The criterion class currently assumes a supervised learning interface.
+  - Our fix: We implement a ``BaseCriterion`` object that provides an abstract API for unsupervised criteria.
+- Splitter class: The splitter class currently assumes a supervised learning interface and
+  does not provide a way of generalizing the way split candidates are proposed.
+  - Our fix: We implement a ``BaseSplitter`` object that provides an abstract API for unsupervised splitters and also implement an API to allow generalizations of the ``SplitRecord`` struct and the ``Splitter.node_split`` function. For example, this enables oblique splits to be considered.
+- Tree class: The tree class currently assumes a supervised learning interface and does not
+  provide a way of generalizing the type of tree.
+  - Our fix: We implement a ``BaseTree`` object that provides an abstract API for general tree models and also implement an API that allows generalization of the type of tree. For example, oblique trees are trivially implementable as an extension now.
+- stopping conditions for splitter: Currently, the ``Splitter.node_split`` function has various
+  stopping conditions for the splitter based on hyperparameters. It is plausible that these conditions
+  may be extended. For example, in causal trees, one may want the splitter to also account for
+  a minimal degree of heterogeneity (i.e. variance) in its children nodes.
+
+Python API:
+
+- ``sklearn.tree.BaseDecisionTree`` assumes the underlying tree model is supervised: The ``y``
+  parameter is required to be passed in, which is not necessary for general tree-based models.
+  For example, an unsupervised tree may pass in ``y=None``.
+  - Our fix: We fix this API, so the ``BaseDecisionTree`` is subclassable by unsupervised tree models that do not require ``y`` to be defined.
+- ``sklearn.tree.BaseDecisionTree`` does not provide a way to generalize the ``Criterion``, ``Splitter``
+  and ``Tree`` Cython classes used: The current codebase requires users to define custom
+  criteria and/or splitters outside the instantiation of the ``BaseDecisionTree``. This prevents
+  users from generalizing the ``Criterion`` and ``Splitter`` and creating a neat Python API wrapper.
+  Moreover, the ``Tree`` class is not customizable.
+  - Our fix: We internally implement a private function to actually build the entire tree, ``BaseDecisionTree._build_tree``, which can be overridden in subclasses that customize the criterion, splitter, or tree, or any combination of them.
+- ``sklearn.ensemble.BaseForest`` and its subclass algorithms are slow when ``n_samples`` is very high. Binning
+  features into a histogram, which is the basis of "LightGBM" and "HistGradientBoostingClassifier", is a computational
+  trick that can both significantly increase runtime efficiency and help prevent overfitting in trees, since
+  the sorting in "BestSplitter" is done on bins rather than the continuous feature values. This would enable
+  random forests and their variants to scale to millions of samples.
+  - Our fix: We added a ``max_bins=None`` keyword argument to the ``BaseForest`` class, and all its subclasses. The default behavior is no binning. The current implementation is not necessarily efficient. There are several improvements to be made. See below.
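+
+As a quick illustration of the new ``max_bins`` keyword argument described in the last item
+above, here is a minimal sketch of fitting a binned random forest. It mirrors the new tests
+added in ``sklearn/ensemble/tests/test_forest.py``; the dataset and the parameter values are
+arbitrary choices for illustration, and it assumes this fork is installed so that it is
+importable as ``sklearn``::
+
+    from sklearn.datasets import make_classification
+    from sklearn.ensemble import RandomForestClassifier
+
+    # 200 samples, 10 features; any array-like X, y works the same way
+    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
+
+    # max_bins=255 bins each feature before split search, similar in spirit to
+    # HistGradientBoostingClassifier; max_bins=None (the default) disables binning
+    clf = RandomForestClassifier(n_estimators=10, max_bins=255, random_state=0)
+    clf.fit(X, y)
+
+    # predict/apply re-bin incoming data with the fitted bin mapper
+    print(clf.predict(X[:5]))
+    print(clf.apply(X).shape)  # (200, 10): one leaf index per sample per tree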
+ +Overall, the existing tree models, such as :class:`~sklearn.tree.DecisionTreeClassifier` +and :class:`~sklearn.ensemble.RandomForestClassifier` all work exactly the same as they +would in ``scikit-learn`` main, but these extensions enable 3rd-party packages to extend +the Cython/Python API easily. + +Roadmap +------- +There are several improvements that can be made in this fork. Primarily, the binning feature +promises to make Random Forests and their variants ultra-fast. However, the binning needs +to be implemented in a similar fashion to ``HistGradientBoostingClassifier``, which passes +in the binning thresholds throughout the tree construction step, such that the split nodes +store the actual numerical value of the bin rather than the "bin index". This requires +modifying the tree Cython code to take in a ``binning_thresholds`` parameter that is part +of the ``_BinMapper`` fitted class. This also allows us not to do any binning during prediction/apply +time because the tree already stores the "numerical" threshold value we would want to apply +to any incoming ``X`` that is not binned. + +Besides that modification, the tree and splitter need to be able to handle not just ``np.float32`` +data (the type for X normally in Random Forests), but also ``uint8`` data (the type for X when it +is binned in to e.g. 255 bins). This would not only save RAM since ``uint8`` storage of millions +of samples would result in many GB saved, but also improved runtime. + +So in summary, the Cython code of the tree submodule needs to take in an extra parameter for +the binning thresholds if binning occurs and also be able to handle ``X`` being of dtype ``uint8``. +Afterwards, Random Forests will have fully leveraged the binning feature. + +Something to keep in mind is that upstream scikit-learn is actively working on incorporating +missing-value handling and categorical handling into Random Forests. + +Next steps +---------- + +We have briefly covered how the tree submodule has changed with respect to ``scikit-learn``. +This enables packages to leverage these changes in developing more complex tree models +that may, or may not eventually be PRed into ``scikit-learn``. For example, + +- `scikit-tree `_ is a scikit-learn + compatible package for more complex and advanced tree models. + +If you are developing tree models, we encourage you to take a look at that package, or +if you have suggestions to make the tree submodule of our fork, ``scikit-learn-tree`` +more \ No newline at end of file diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 5238cd1121d2e..db5b5d9414053 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -7,7 +7,7 @@ set -x source build_tools/shared.sh UNAMESTR=`uname` -CCACHE_LINKS_DIR="/tmp/ccache" +CCACHE_LINKS_DIR="/tmp/ccachev2" setup_ccache() { CCACHE_BIN=`which ccache || echo ""` diff --git a/build_tools/azure/install_win.sh b/build_tools/azure/install_win.sh index ab559a1878971..011e962885d45 100755 --- a/build_tools/azure/install_win.sh +++ b/build_tools/azure/install_win.sh @@ -22,4 +22,4 @@ show_installed_libraries python setup.py bdist_wheel # Install the generated wheel package to test it -pip install --pre --no-index --find-links dist scikit-learn +pip install --pre --no-index --find-links dist scikit-learn-tree diff --git a/doc/Makefile b/doc/Makefile index b56a1289cd581..c728bbbfd033e 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -53,6 +53,8 @@ html: @echo @echo "Build finished. 
The HTML pages are in $(BUILDDIR)/html/stable" +# rm $(BUILDDIR)/html/stable/index.html +# mv $(BUILDDIR)/html/stable/fork_index.html $(BUILDDIR)/html/stable/index.html html-noplot: $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable @echo diff --git a/doc/conf.py b/doc/conf.py index 52b084b331c8c..01e0a332dd54f 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -103,7 +103,8 @@ # source_encoding = 'utf-8' # The main toctree document. -root_doc = "contents" +# root_doc = "contents" +root_doc = "index" # General information about the project. project = "scikit-learn" diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index 789b0bab616ca..7fa12fd16d487 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -141,7 +141,7 @@ Once trained, you can plot the tree with the :func:`plot_tree` function:: >>> tree.plot_tree(clf) [...] -.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_002.png +.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_003.png :target: ../auto_examples/tree/plot_iris_dtc.html :scale: 75 :align: center @@ -331,6 +331,8 @@ total cost over the entire trees (by summing the cost at each node) of :math:`O(n_{features}n_{samples}^{2}\log(n_{samples}))`. +.. _tree_tips_usage: + Tips on practical use ===================== @@ -612,11 +614,66 @@ be pruned. This process stops when the pruned tree's minimal * :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` +Classification, regression and multi-output problems +---------------------------------------------------- + +OTs can be used for both classification and regression, and can handle multi-output +problems in the same manner as DTs. + +Complexity +---------- + +The run time cost to construct an OT will be similar to that of a DT, with the +added complexity of a (possibly sparse) matrix multiplication to combine random +data columns into candidate split values. The cost at each node is +:math:`O(n_{features}n_{samples}\log(n_{samples}) + n_{features}n_{samples}max\_features \lambda)` +where the additional :math:`n_{features}n_{samples}max\_features \lambda` term +comes from the (possibly sparse) matrix multiplication controlled by both the +number of candidate splits to generate ("max_features") and the sparsity of +the projection matrix that combines the data features (":math:`\lambda`"). + +Another consideration is space-complexity. + +Space-complexity and storing the OT pickled on disc is also a consideration. OTs +at every node need to store an additional vector of feature indices and vector of +feature weights that are used together to form the candidate splits. + +Tips on practical use +--------------------- + +Similar to DTs, the intuition for most parameters are the same. Therefore refer +to :ref:`tips for using decision trees ` for information on standard +tree parameters. Specific parameters, such as ``max_features`` and +``feature_combinations`` are different or special to OTs. + + * As specified earlier, ``max_features`` is not constrained to ``n_features`` + as it is in DTs. Setting ``max_features`` higher requires more computation time because + the algorithm needs to sample more candidate splits at every node. However, it also possibly + lets the user to sample more informative splits, thereby improving the model fit. This + presents a tradeoff between runtime resources and improvements to the model. 
In practice, + we found that sampling more splits, say up to ``max_features=n_features**2``, is desirable + if one is willing to spend the computational resources. + + * ``feature_combinations`` is the :math:`\lambda` term presented in the complexity + analysis, which specifies how sparse our combination of features is. If + ``feature_combinations=n_features``, then OT is the ``Forest-RC`` version. However, + in practice, ``feature_combinations`` can be set much lower, therefore improving runtime + and storage complexity. + +Finally, when asking the question of when to use OTs vs DTs, scikit-learn recommends +always trying both model using some type of cross-validation procedure and hyperparameter +optimization (e.g. `GridSearchCV`). If one has prior knowledge about how the data is +distributed along its features, such as data being axis-aligned, then one might use a DT. +Other considerations are runtime and space complexity. + .. topic:: References: .. [BRE] L. Breiman, J. Friedman, R. Olshen, and C. Stone. Classification and Regression Trees. Wadsworth, Belmont, CA, 1984. - + + .. [RF] L. Breiman. Random Forests. Machine Learning 45, 5–32 (2001). + https://doi.org/10.1023/A:1010933404324. + * https://en.wikipedia.org/wiki/Decision_tree_learning * https://en.wikipedia.org/wiki/Predictive_analytics diff --git a/examples/tree/plot_iris_dtc.py b/examples/tree/plot_iris_dtc.py index 14f6506b5810f..0dcca718bc6f0 100644 --- a/examples/tree/plot_iris_dtc.py +++ b/examples/tree/plot_iris_dtc.py @@ -2,16 +2,12 @@ ======================================================================= Plot the decision surface of decision trees trained on the iris dataset ======================================================================= - Plot the decision surface of a decision tree trained on pairs of features of the iris dataset. - See :ref:`decision tree ` for more information on the estimator. - For each pair of iris features, the decision tree learns decision boundaries made of combinations of simple thresholding rules inferred from the training samples. - We also show the tree structure of a model built on all of the features. """ # %% diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index f5522600f623f..e39e39455b7bc --- a/setup.py +++ b/setup.py @@ -30,19 +30,19 @@ builtins.__SKLEARN_SETUP__ = True -DISTNAME = "scikit-learn" -DESCRIPTION = "A set of python modules for machine learning and data mining" +DISTNAME = "scikit-learn-tree" +DESCRIPTION = "A maintained fork of scikit-learn that extends the tree submodule." 
with open("README.rst") as f: LONG_DESCRIPTION = f.read() -MAINTAINER = "Andreas Mueller" -MAINTAINER_EMAIL = "amueller@ais.uni-bonn.de" +MAINTAINER = "Adam Li" +MAINTAINER_EMAIL = "adam.li@columbia.edu" URL = "http://scikit-learn.org" -DOWNLOAD_URL = "https://pypi.org/project/scikit-learn/#files" +DOWNLOAD_URL = "https://pypi.org/project/scikit-learn-tree/#files" LICENSE = "new BSD" PROJECT_URLS = { - "Bug Tracker": "https://github.com/scikit-learn/scikit-learn/issues", + "Bug Tracker": "https://github.com/neurodata/scikit-learn/issues", "Documentation": "https://scikit-learn.org/stable/documentation.html", - "Source Code": "https://github.com/scikit-learn/scikit-learn", + "Source Code": "https://github.com/neurodata/scikit-learn", } # We can actually import a restricted version of sklearn that @@ -170,11 +170,11 @@ def check_package_status(package, min_version): package_status["up_to_date"] = False package_status["version"] = "" - req_str = "scikit-learn requires {} >= {}.\n".format(package, min_version) + req_str = "scikit-learn-tree requires {} >= {}.\n".format(package, min_version) instructions = ( "Installation instructions are available on the " - "scikit-learn website: " + "scikit-learn-tree website: " "http://scikit-learn.org/stable/install.html\n" ) @@ -221,10 +221,10 @@ def check_package_status(package, min_version): {"sources": ["_cdnmf_fast.pyx"], "include_np": True}, ], "ensemble": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, ], "ensemble._hist_gradient_boosting": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, {"sources": ["histogram.pyx"], "include_np": True}, {"sources": ["splitting.pyx"], "include_np": True}, {"sources": ["_binning.pyx"], "include_np": True}, @@ -306,7 +306,7 @@ def check_package_status(package, min_version): {"sources": ["_ball_tree.pyx"], "include_np": True}, {"sources": ["_kd_tree.pyx"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, - {"sources": ["_quad_tree.pyx"], "include_np": True}, + {"sources": ["_quad_tree.pyx"], "language": "c++", "include_np": True}, ], "svm": [ { @@ -374,9 +374,24 @@ def check_package_status(package, min_version): "include_np": True, "optimization_level": "O3", }, - {"sources": ["_splitter.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_criterion.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_utils.pyx"], "include_np": True, "optimization_level": "O3"}, + { + "sources": ["_splitter.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_criterion.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_utils.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, ], "utils": [ {"sources": ["sparsefuncs_fast.pyx"], "include_np": True}, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 19203da4fce1f..a3c29e4a269ce 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -40,6 +40,7 @@ class calls the ``fit`` method of each sub-estimator on random samples # License: BSD 3 clause +from time import time from numbers import Integral, Real from warnings import catch_warnings, simplefilter, warn import threading @@ -72,10 +73,11 @@ class calls the ``fit`` method 
of each sub-estimator on random samples _check_sample_weight, _check_feature_names_in, ) +from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import _num_samples from ..utils._param_validation import Interval, StrOptions from ..utils._param_validation import RealNotInt - +from ._hist_gradient_boosting.binning import _BinMapper __all__ = [ "RandomForestClassifier", @@ -210,6 +212,10 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], + "max_bins": [ + None, + Interval(Integral, 1, None, closed="left"), + ], } @abstractmethod @@ -228,6 +234,7 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator=estimator, @@ -244,6 +251,7 @@ def __init__( self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples + self.max_bins = max_bins def apply(self, X): """ @@ -263,6 +271,15 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -420,6 +437,38 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -628,6 +677,35 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + def _bin_data(self, X, is_training_data): + """Bin data X. 
+ + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. + """ + + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + def _accumulate_prediction(predict, X, out, lock): """ @@ -669,6 +747,7 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator=estimator, @@ -683,6 +762,7 @@ def __init__( class_weight=class_weight, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, ) @staticmethod @@ -856,6 +936,14 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -937,6 +1025,7 @@ def __init__( warm_start=False, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator, @@ -950,6 +1039,7 @@ def __init__( warm_start=warm_start, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, ) def predict(self, X): @@ -975,6 +1065,14 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1399,6 +1497,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=DecisionTreeClassifier(), @@ -1423,6 +1522,7 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -1734,6 +1834,7 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=DecisionTreeRegressor(), @@ -1757,6 +1858,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -2084,6 +2186,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=ExtraTreeClassifier(), @@ -2108,6 +2211,7 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -2406,6 +2510,7 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2429,6 +2534,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 9bf0bb2becd9b..0150340f24bc6 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -118,6 +118,120 @@ FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) +def _sparse_parity(n, p=20, p_star=3, random_state=None): + """Generate sparse parity dataset. + + Sparse parity is a multivariate generalization of the + XOR problem. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset, by default 20 + p_star : int, optional + The number of informative dimensions, by default 3. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Sparse parity dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + y[i] = sum(X[i, :p_star] > 0) % 2 + + return X, y + + +def _orthant(n, p=6, random_state=None): + """Generate orthant dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 6. + rec : int, optional + _description_, by default 1 + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Orthant dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + orth_labels = np.asarray([2**i for i in range(0, p)][::-1]) + + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + idx = np.where(X[i, :] > 0)[0] + y[i] = sum(orth_labels[idx]) + + if len(np.unique(y)) < 2**p: + raise RuntimeError("Increase sample size to get a label in each orthant.") + + return X, y + + +def _trunk(n, p=10, random_state=None): + """Generate trunk dataset. 
+ + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 10. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Trunk dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + + References + ---------- + [1] Gerard V. Trunk. A problem of dimensionality: A + simple example. IEEE Transactions on Pattern Analysis + and Machine Intelligence, 1(3):306–307, 1979. + """ + rng = np.random.RandomState(seed=random_state) + + mu_1 = np.array([1 / i for i in range(1, p + 1)]) + mu_0 = -1 * mu_1 + cov = np.identity(p) + + X = np.vstack( + ( + rng.multivariate_normal(mu_0, cov, int(n / 2)), + rng.multivariate_normal(mu_1, cov, int(n / 2)), + ) + ) + y = np.concatenate((np.zeros(int(n / 2)), np.ones(int(n / 2)))) + return X, y + + def check_classification_toy(name): """Check classification on a toy dataset.""" ForestClassifier = FOREST_CLASSIFIERS[name] @@ -1791,3 +1905,60 @@ def test_round_samples_to_one_when_samples_too_low(class_weight): n_estimators=10, max_samples=1e-4, class_weight=class_weight, random_state=0 ) forest.fit(X, y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classification_toy_withbins(name): + """Check classification on a toy dataset.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, random_state=1, max_bins=255) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + clf = ForestClassifier( + n_estimators=10, max_features=1, random_state=1, max_bins=255 + ) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + # also test apply + leaf_indices = clf.apply(X) + assert leaf_indices.shape == (len(X), clf.n_estimators) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) +def test_regression_criterion_withbins(name, criterion): + # Check consistency on regression dataset. 
+ ForestRegressor = FOREST_REGRESSORS[name] + + reg = ForestRegressor( + n_estimators=5, criterion=criterion, random_state=1, max_bins=250 + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert ( + score > 0.93 + ), "Failed with max_features=None, criterion %s and score = %f" % ( + criterion, + score, + ) + + reg = ForestRegressor( + n_estimators=5, + criterion=criterion, + max_features=6, + random_state=1, + max_bins=250, + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.92, "Failed with max_features=6, criterion %s and score = %f" % ( + criterion, + score, + ) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index b175275ea92dc..bd54483bf2dfe 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -40,8 +40,8 @@ from ..utils._param_validation import Hidden, Interval, StrOptions from ..utils._param_validation import RealNotInt -from ._criterion import Criterion -from ._splitter import Splitter +from ._criterion import BaseCriterion +from ._splitter import BaseSplitter from ._tree import DepthFirstTreeBuilder from ._tree import BestFirstTreeBuilder from ._tree import Tree @@ -174,7 +174,7 @@ def get_n_leaves(self): check_is_fitted(self) return self.tree_.n_leaves - def fit(self, X, y, sample_weight=None, check_input=True): + def fit(self, X, y=None, sample_weight=None, check_input=True): self._validate_params() random_state = check_random_state(self.random_state) @@ -184,9 +184,12 @@ def fit(self, X, y, sample_weight=None, check_input=True): # csr. check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) - X, y = self._validate_data( - X, y, validate_separately=(check_X_params, check_y_params) - ) + if y is not None or self._get_tags()["requires_y"]: + X, y = self._validate_data( + X, y, validate_separately=(check_X_params, check_y_params) + ) + else: + X = self._validate_data(X, **check_X_params) if issparse(X): X.sort_indices() @@ -195,7 +198,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): "No support for np.int64 index based sparse matrices" ) - if self.criterion == "poisson": + if y is not None and self.criterion == "poisson": if np.any(y < 0): raise ValueError( "Some value(s) of y are negative which is" @@ -209,45 +212,56 @@ def fit(self, X, y, sample_weight=None, check_input=True): # Determine output settings n_samples, self.n_features_in_ = X.shape - is_classification = is_classifier(self) - y = np.atleast_1d(y) - expanded_class_weight = None + # Do preprocessing if 'y' is passed + is_classification = False + if y is not None: + is_classification = is_classifier(self) + + y = np.atleast_1d(y) + expanded_class_weight = None - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. 
+ y = np.reshape(y, (-1, 1)) - self.n_outputs_ = y.shape[1] + self.n_outputs_ = y.shape[1] - if is_classification: - check_classification_targets(y) - y = np.copy(y) + if is_classification: + check_classification_targets(y) + y = np.copy(y) - self.classes_ = [] - self.n_classes_ = [] + self.classes_ = [] + self.n_classes_ = [] - if self.class_weight is not None: - y_original = np.copy(y) + if self.class_weight is not None: + y_original = np.copy(y) - y_encoded = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - y = y_encoded - - if self.class_weight is not None: - expanded_class_weight = compute_sample_weight( - self.class_weight, y_original - ) + y_encoded = np.zeros(y.shape, dtype=int) + for k in range(self.n_outputs_): + classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_encoded + + if self.class_weight is not None: + expanded_class_weight = compute_sample_weight( + self.class_weight, y_original + ) - self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) + self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + if len(y) != n_samples: + raise ValueError( + "Number of labels=%d does not match number of samples=%d" + % (len(y), n_samples) + ) + + # set decision-tree model parameters max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth if isinstance(self.min_samples_leaf, numbers.Integral): @@ -299,16 +313,10 @@ def fit(self, X, y, sample_weight=None, check_input=True): max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - if len(y) != n_samples: - raise ValueError( - "Number of labels=%d does not match number of samples=%d" - % (len(y), n_samples) - ) - if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) - if expanded_class_weight is not None: + if y is not None and expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: @@ -320,10 +328,63 @@ def fit(self, X, y, sample_weight=None, check_input=True): else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + # build the actual tree now with the parameters + self._build_tree( + X, + y, + sample_weight, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ) + + return self + + def _build_tree( + self, + X, + y, + sample_weight, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : Array-like + X dataset. + y : Array-like + Y targets. + sample_weight : Array-like + Sample weights + min_samples_leaf : float + Number of samples required to be a leaf. + min_weight_leaf : float + Weight of samples required to be a leaf. + max_leaf_nodes : float + Maximum number of leaf nodes allowed in tree. + min_samples_split : float + Minimum number of samples to split on. + max_depth : int + The maximum depth of any tree. + random_state : int + Random seed. 
+ """ + + n_samples = X.shape[0] + # Build tree criterion = self.criterion - if not isinstance(criterion, Criterion): - if is_classification: + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): criterion = CRITERIA_CLF[self.criterion]( self.n_outputs_, self.n_classes_ ) @@ -337,7 +398,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS splitter = self.splitter - if not isinstance(self.splitter, Splitter): + if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( criterion, self.max_features_, @@ -385,8 +446,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): self._prune_tree() - return self - def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: @@ -817,7 +876,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, - "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], + "criterion": [ + StrOptions({"gini", "entropy", "log_loss"}), + Hidden(BaseCriterion), + ], "class_weight": [dict, list, StrOptions({"balanced"}), None], } @@ -1173,7 +1235,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): **BaseDecisionTree._parameter_constraints, "criterion": [ StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}), - Hidden(Criterion), + Hidden(BaseCriterion), ], } diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 47f616c6bad50..2e179e78e8c3f 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -4,6 +4,8 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause @@ -15,13 +17,11 @@ from ._tree cimport SIZE_t # Type for indices and counters from ._tree cimport INT32_t # Signed 32 bit integer from ._tree cimport UINT32_t # Unsigned 32 bit integer -cdef class Criterion: - # The criterion computes the impurity of a node and the reduction of - # impurity of a split on that node. It also computes the output statistics - # such as the mean in regression and class probabilities in classification. + +cdef class BaseCriterion: + """Abstract interface for criterion.""" # Internal structures - cdef const DOUBLE_t[:, ::1] y # Values of y cdef const DOUBLE_t[:] sample_weight # Sample weights cdef const SIZE_t[:] sample_indices # Sample indices in X, y @@ -37,19 +37,7 @@ cdef class Criterion: cdef double weighted_n_left # Weighted number of samples in the left node cdef double weighted_n_right # Weighted number of samples in the right node - # The criterion object is maintained such that left and right collected - # statistics correspond to samples[start:pos] and samples[pos:end]. - - # Methods - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end - ) except -1 nogil + # Core methods that criterion class _must_ implement. 
cdef int reset(self) except -1 nogil cdef int reverse_reset(self) except -1 nogil cdef int update(self, SIZE_t new_pos) except -1 nogil @@ -71,6 +59,25 @@ cdef class Criterion: ) noexcept nogil cdef double proxy_impurity_improvement(self) noexcept nogil + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil + +cdef class Criterion(BaseCriterion): + """Abstract interface for supervised impurity criteria.""" + + cdef const DOUBLE_t[:, ::1] y + + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" @@ -88,4 +95,4 @@ cdef class RegressionCriterion(Criterion): cdef double[::1] sum_total # The sum of w*y. cdef double[::1] sum_left # Same as above, but for the left side of the split - cdef double[::1] sum_right # Same as above, but for the right side of the split + cdef double[::1] sum_right # Same as above, but for the right side of the split \ No newline at end of file diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 7cd7bbb0e3c1b..c94914daa0e0b 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -9,6 +9,8 @@ # Fares Hedayati # Jacob Schreiber # Nelson Liu +# Adam Li +# Jong Shin # # License: BSD 3 clause @@ -29,11 +31,20 @@ from ._utils cimport WeightedMedianCalculator # EPSILON is used in the Poisson criterion cdef double EPSILON = 10 * np.finfo('double').eps -cdef class Criterion: - """Interface for impurity criteria. - +cdef class BaseCriterion: + """This is an abstract interface for criterion. For example, a tree model could + be either supervisedly, or unsupervisedly computing impurity on samples of + covariates, or labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for criteria. + The downstream classes _must_ implement methods to compute the impurity + in current node and in children nodes. This object stores methods on how to calculate how good a split is using - different metrics. + a set API. + Samples in the "current" node are stored in `samples[start:end]` which is + partitioned around `pos` (an index in `start:end`) so that: + - the samples of left child node are stored in `samples[start:pos]` + - the samples of right child node are stored in `samples[pos:end]` """ def __getstate__(self): return {} @@ -41,61 +52,23 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, - ) except -1 nogil: - """Placeholder for a method which will initialize the criterion. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - - Parameters - ---------- - y : ndarray, dtype=DOUBLE_t - y is a buffer that can store values for n_outputs target variables - stored as a Cython memoryview. - sample_weight : ndarray, dtype=DOUBLE_t - The weight of each sample stored as a Cython memoryview. - weighted_n_samples : double - The total weight of the samples being considered - sample_indices : ndarray, dtype=SIZE_t - A mask on the samples. Indices of the samples in X and y we want to use, - where sample_indices[start:end] correspond to the samples in this node. 
- start : SIZE_t - The first sample to be used on this node - end : SIZE_t - The last sample used on this node - - """ - pass - cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - This method must be implemented by the subclass. """ pass cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - This method must be implemented by the subclass. """ pass cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. - This updates the collected statistics by moving sample_indices[pos:new_pos] from the right child to the left child. It must be implemented by the subclass. - Parameters ---------- new_pos : SIZE_t @@ -105,7 +78,6 @@ cdef class Criterion: cdef double node_impurity(self) noexcept nogil: """Placeholder for calculating the impurity of the node. - Placeholder for a method which will evaluate the impurity of the current node, i.e. the impurity of sample_indices[start:end]. This is the primary function of the criterion class. The smaller the impurity the @@ -116,11 +88,9 @@ cdef class Criterion: cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Placeholder for calculating the impurity of children. - Placeholder for a method which evaluates the impurity in children nodes, i.e. the impurity of sample_indices[start:pos] + the impurity of sample_indices[pos:end]. - Parameters ---------- impurity_left : double pointer @@ -134,10 +104,8 @@ cdef class Criterion: cdef void node_value(self, double* dest) noexcept nogil: """Placeholder for storing the node value. - Placeholder for a method which will compute the node value of sample_indices[start:end] and save the value into dest. - Parameters ---------- dest : double pointer @@ -147,12 +115,10 @@ cdef class Criterion: cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -167,28 +133,21 @@ cdef class Criterion: double impurity_left, double impurity_right) noexcept nogil: """Compute the improvement in impurity. - This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: - N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) - where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, - Parameters ---------- impurity_parent : double The initial impurity of the parent node before the split - impurity_left : double The impurity of the left child - impurity_right : double The impurity of the right child - Return ------ double : improvement in impurity after the split occurs @@ -199,6 +158,61 @@ cdef class Criterion: - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Abstract method which will set sample pointers in the criterion. 
+ The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + Parameters + ---------- + start : SIZE_t + The index of the first sample to be used on computation of criteria of the current node. + end : SIZE_t + The last sample used on this node + """ + pass + + +cdef class Criterion(BaseCriterion): + """Interface for impurity criteria. + The supervised criterion computes the impurity of a node and the reduction of + impurity of a split on that node using the distribution of labels in parent and + children nodes. It also computes the output statistics + such as the mean in regression and class probabilities in classification. + Instances of this class are responsible for compute splits' impurity difference + Criterion is the base class for criteria used in supervised tree-based models + with a homogeneous float64-dtyped y. + """ + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil: + """Placeholder for a method which will initialize the criterion. + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + Parameters + ---------- + y : ndarray, dtype=DOUBLE_t + y is a buffer that can store values for n_outputs target variables + stored as a Cython memoryview. + sample_weight : ndarray, dtype=DOUBLE_t + The weight of each sample stored as a Cython memoryview. + weighted_n_samples : double + The total weight of the samples being considered + sample_indices : ndarray, dtype=SIZE_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. + """ + pass + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" @@ -206,7 +220,6 @@ cdef class ClassificationCriterion(Criterion): def __cinit__(self, SIZE_t n_outputs, cnp.ndarray[SIZE_t, ndim=1] n_classes): """Initialize attributes for this criterion. - Parameters ---------- n_outputs : SIZE_t @@ -254,18 +267,11 @@ cdef class ClassificationCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end + const SIZE_t[:] sample_indices ) except -1 nogil: """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. - Parameters ---------- y : ndarray, dtype=DOUBLE_t @@ -277,18 +283,24 @@ cdef class ClassificationCriterion(Criterion): sample_indices : ndarray, dtype=SIZE_t A mask on the samples. Indices of the samples in X and y we want to use, where sample_indices[start:end] correspond to the samples in this node. 
- start : SIZE_t - The first sample to use in the mask - end : SIZE_t - The last sample to use in the mask """ self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + self.n_node_samples = end - start self.start = start self.end = end - self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0.0 cdef SIZE_t i @@ -301,12 +313,12 @@ cdef class ClassificationCriterion(Criterion): memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] # w is originally set to be 1.0, meaning that if no sample weights # are given, the default weight of each sample is 1.0. - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] # Count weighted class frequency for each target for k in range(self.n_outputs): @@ -317,11 +329,9 @@ cdef class ClassificationCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -338,7 +348,6 @@ cdef class ClassificationCriterion(Criterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -355,10 +364,8 @@ cdef class ClassificationCriterion(Criterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. - Parameters ---------- new_pos : SIZE_t @@ -428,7 +435,6 @@ cdef class ClassificationCriterion(Criterion): cdef void node_value(self, double* dest) noexcept nogil: """Compute the node value of sample_indices[start:end] and save it into dest. - Parameters ---------- dest : double pointer @@ -443,23 +449,17 @@ cdef class ClassificationCriterion(Criterion): cdef class Entropy(ClassificationCriterion): r"""Cross Entropy impurity criterion. - This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. - The cross-entropy is then defined as - cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the cross-entropy criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -481,10 +481,8 @@ cdef class Entropy(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). - Parameters ---------- impurity_left : double pointer @@ -516,24 +514,18 @@ cdef class Entropy(ClassificationCriterion): cdef class Gini(ClassificationCriterion): r"""Gini Index impurity criterion. 
- This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. - The Gini Index is then defined as: - index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the Gini criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -559,10 +551,8 @@ cdef class Gini(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]) using the Gini index. - Parameters ---------- impurity_left : double pointer @@ -601,24 +591,20 @@ cdef class Gini(ClassificationCriterion): cdef class RegressionCriterion(Criterion): r"""Abstract regression criterion. - This handles cases where the target is a continuous value, and is evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` by using :: - var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. - Parameters ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -648,23 +634,29 @@ cdef class RegressionCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + + self.sq_sum_total = 0.0 self.weighted_n_node_samples = 0. cdef SIZE_t i @@ -673,14 +665,14 @@ cdef class RegressionCriterion(Criterion): cdef DOUBLE_t y_ik cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 - self.sq_sum_total = 0.0 + memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] @@ -692,7 +684,6 @@ cdef class RegressionCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start.""" @@ -785,13 +776,11 @@ cdef class RegressionCriterion(Criterion): cdef class MSE(RegressionCriterion): """Mean squared error impurity criterion. 
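For the classification criteria just above, a small NumPy sketch of the node impurities as given by the docstring formulas; ``class_counts`` is a made-up example vector and the log base is left unspecified, as in the formula.

    import numpy as np

    class_counts = np.array([3.0, 1.0, 4.0])   # hypothetical counts for one node
    p = class_counts / class_counts.sum()      # count_k in the docstrings

    gini = 1.0 - np.sum(p ** 2)                        # 1 - sum_k count_k**2
    entropy = -np.sum(p[p > 0] * np.log(p[p > 0]))     # -sum_k count_k log(count_k)
    print(round(gini, 4), round(entropy, 4))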
- MSE = var_left + var_right """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the MSE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -807,22 +796,16 @@ cdef class MSE(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. - The MSE proxy is derived from - sum_{i left}(y_i - y_pred_L)^2 + sum_{i right}(y_i - y_pred_R)^2 = sum(y_i^2) - n_L * mean_{i left}(y_i)^2 - n_R * mean_{i right}(y_i)^2 - Neglecting constant terms, this gives: - - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2 """ cdef SIZE_t k @@ -839,7 +822,6 @@ cdef class MSE(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -883,7 +865,6 @@ cdef class MSE(RegressionCriterion): cdef class MAE(RegressionCriterion): r"""Mean absolute error impurity criterion. - MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true value and f_i is the predicted value.""" @@ -895,12 +876,10 @@ cdef class MAE(RegressionCriterion): def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. - Parameters ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -933,26 +912,30 @@ cdef class MAE(RegressionCriterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ - cdef SIZE_t i, p, k - cdef DOUBLE_t w = 1.0 - + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + cdef SIZE_t i, p, k + cdef DOUBLE_t w = 1.0 + self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0. 
cdef void** left_child = self.left_child_ptr @@ -963,10 +946,10 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).reset() for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): # push method ends up calling safe_realloc, hence `except -1` @@ -981,11 +964,9 @@ cdef class MAE(RegressionCriterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1016,7 +997,6 @@ cdef class MAE(RegressionCriterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1044,7 +1024,6 @@ cdef class MAE(RegressionCriterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1107,7 +1086,6 @@ cdef class MAE(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the MAE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -1132,7 +1110,6 @@ cdef class MAE(RegressionCriterion): cdef void children_impurity(self, double* p_impurity_left, double* p_impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -1179,21 +1156,17 @@ cdef class MAE(RegressionCriterion): cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman. - Uses the formula (35) in Friedman's original Gradient Boosting paper: - diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) """ cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -1234,9 +1207,7 @@ cdef class FriedmanMSE(MSE): cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. - Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) - Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. It remains the implemented impurity (factor 2 is skipped): @@ -1255,7 +1226,6 @@ cdef class Poisson(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the Poisson criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. 
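For the half Poisson deviance described above, a hedged NumPy sketch with ``y_pred`` taken as the node mean, so that the ``sum(y_pred - y_true)`` term vanishes and the factor 2 can be dropped as the docstring notes; ``y_true`` is a made-up positive sample.

    import numpy as np

    y_true = np.array([1.0, 2.0, 0.5, 3.0])
    y_pred = y_true.mean()

    half_poisson_dev = np.mean(y_true * np.log(y_true / y_pred))
    print(half_poisson_dev)  # >= 0; smaller means a purer node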
@@ -1265,24 +1235,18 @@ cdef class Poisson(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. - The Poisson proxy is derived from: - sum_{i left }(y_i * log(y_i / y_pred_L)) + sum_{i right}(y_i * log(y_i / y_pred_R)) = sum(y_i * log(y_i) - n_L * mean_{i left}(y_i) * log(mean_{i left}(y_i)) - n_R * mean_{i right}(y_i) * log(mean_{i right}(y_i)) - Neglecting constant terms, this gives - - sum{i left }(y_i) * log(mean{i left}(y_i)) - sum{i right}(y_i) * log(mean{i right}(y_i)) """ @@ -1312,7 +1276,6 @@ cdef class Poisson(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity of the right child (sample_indices[pos:end]) for Poisson. """ diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 13fec5974c3c5..b0207ab0a715d 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -4,12 +4,14 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause # See _splitter.pyx for details. -from ._criterion cimport Criterion +from ._criterion cimport BaseCriterion, Criterion from ._tree cimport DTYPE_t # Type of X from ._tree cimport DOUBLE_t # Type of y, sample_weight @@ -28,14 +30,15 @@ cdef struct SplitRecord: double impurity_left # Impurity of the left split. double impurity_right # Impurity of the right split. -cdef class Splitter: +cdef class BaseSplitter: + """Abstract interface for splitter.""" + # The splitter searches in the input space for a feature and a threshold # to split the samples samples[start:end]. # # The impurity computations are delegated to a criterion object. # Internal structures - cdef public Criterion criterion # Impurity criterion cdef public SIZE_t max_features # Number of features to test cdef public SIZE_t min_samples_leaf # Min samples in a leaf cdef public double min_weight_leaf # Minimum weight in a leaf @@ -54,7 +57,6 @@ cdef class Splitter: cdef SIZE_t start # Start position for the current node cdef SIZE_t end # End position for the current node - cdef const DOUBLE_t[:, ::1] y cdef const DOUBLE_t[:] sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -74,27 +76,38 @@ cdef class Splitter: # This allows optimization with depth-based tree building. 
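Before the splitter methods below, a conceptual pure-Python rendering (not the Cython API itself) of how the refactored criterion is now driven: dataset-wide state is set once via ``init``, and the per-node window is then moved with ``set_sample_pointers``. The method names mirror the declarations above, but the class itself is hypothetical.

    class ToyCriterion:  # hypothetical stand-in for the Cython Criterion
        def init(self, y, sample_weight, weighted_n_samples, sample_indices):
            # called once per fit with dataset-wide quantities
            self.y, self.sample_weight = y, sample_weight
            self.weighted_n_samples = weighted_n_samples
            self.sample_indices = sample_indices

        def set_sample_pointers(self, start, end):
            # called for every node: restrict to sample_indices[start:end]
            self.start, self.end = start, end
            self.n_node_samples = end - start
            # ...recompute the per-node sufficient statistics here...

    # The splitter's init(...) calls ToyCriterion.init(...) once; each
    # node_reset(start, end) then only calls set_sample_pointers(start, end).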
# Methods - cdef int init( - self, - object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight - ) except -1 - cdef int node_reset( self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples ) except -1 nogil - cdef int node_split( self, double impurity, # Impurity of the node SplitRecord* split, SIZE_t* n_constant_features ) except -1 nogil - cdef void node_value(self, double* dest) noexcept nogil - cdef double node_impurity(self) noexcept nogil + cdef int pointer_size(self) noexcept nogil + +cdef class Splitter(BaseSplitter): + cdef public Criterion criterion # Impurity criterion + cdef const DOUBLE_t[:, ::1] y + + cdef int init( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight + ) except -1 + + # Methods that allow modifications to stopping conditions + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + ) noexcept nogil + cdef bint check_postsplit_conditions( + self + ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 83a80d90cc1b9..17a747433d1a8 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -8,7 +8,10 @@ # Joel Nothman # Fares Hedayati # Jacob Schreiber +# Adam Li +# Jong Shin # + # License: BSD 3 clause from ._criterion cimport Criterion @@ -43,16 +46,78 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil self.threshold = 0. self.improvement = -INFINITY -cdef class Splitter: - """Abstract splitter class. +cdef class BaseSplitter: + """This is an abstract interface for splitters. + + For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of + covariates, labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for splitting. + + A splitter is usually used in conjunction with a criterion class, which explicitly handles + computing the criteria, which we split on. The setting of that criterion class is handled + by downstream classes. - Splitters are called by tree builders to find the best splits on both - sparse and dense data, one split at a time. + The downstream classes _must_ implement methods to compute the split in a node. """ + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int node_reset(self, SIZE_t start, SIZE_t end, + double* weighted_n_node_samples) except -1 nogil: + """Reset splitter on node samples[start:end]. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to consider + end : SIZE_t + The index of the last sample to consider + weighted_n_node_samples : ndarray, dtype=double pointer + The total weight of those samples + """ + pass + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) except -1 nogil: + """Find the best split on node samples[start:end]. + + This is a placeholder method. The majority of computation will be done + here. + + It should return -1 upon errors. + """ + pass + + cdef void node_value(self, double* dest) noexcept nogil: + """Copy the value of node samples[start:end] into dest.""" + pass + + cdef double node_impurity(self) noexcept nogil: + """Return the impurity of the current node.""" + pass + + cdef int pointer_size(self) noexcept nogil: + """Size of the pointer for split records. 
+ + Overriding this function allows one to use different subclasses of + `SplitRecord`. + """ + return sizeof(SplitRecord) + +cdef class Splitter(BaseSplitter): + """Abstract interface for supervised splitters.""" + def __cinit__(self, Criterion criterion, SIZE_t max_features, SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state): + object random_state, *argv): """ Parameters ---------- @@ -75,7 +140,6 @@ cdef class Splitter: random_state : object The user inputted random state to be used for pseudo-randomness """ - self.criterion = criterion self.n_samples = 0 @@ -86,11 +150,6 @@ cdef class Splitter: self.min_weight_leaf = min_weight_leaf self.random_state = random_state - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass def __reduce__(self): return (type(self), (self.criterion, @@ -127,7 +186,6 @@ cdef class Splitter: are assumed to have uniform weight. This is represented as a Cython memoryview. """ - self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef SIZE_t n_samples = X.shape[0] @@ -165,6 +223,19 @@ cdef class Splitter: self.y = y self.sample_weight = sample_weight + + self.criterion.init( + self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples + ) + + self.criterion.set_sample_pointers( + self.start, + self.end + ) + return 0 cdef int node_reset(self, SIZE_t start, SIZE_t end, @@ -187,30 +258,11 @@ cdef class Splitter: self.start = start self.end = end - self.criterion.init( - self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples, - start, - end - ) + self.criterion.set_sample_pointers(start, end) weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: - """Find the best split on node samples[start:end]. - - This is a placeholder method. The majority of computation will be done - here. - - It should return -1 upon errors. - """ - - pass - cdef void node_value(self, double* dest) noexcept nogil: """Copy the value of node samples[start:end] into dest.""" @@ -221,6 +273,41 @@ cdef class Splitter: return self.criterion.node_impurity() + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + ) noexcept nogil: + """Check stopping conditions pre-split. + + This is typically a metric that is cheaply computed given the + current proposed split, which is stored as a the `current_split` + argument. + """ + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + + if (((current_split.pos - self.start) < min_samples_leaf) or + ((self.end - current_split.pos) < min_samples_leaf)): + return 1 + + return 0 + + cdef bint check_postsplit_conditions( + self + ) noexcept nogil: + """Check stopping conditions after evaluating the split. + + This takes some metric that is stored in the Criterion + object and checks against internal stop metrics. + """ + cdef double min_weight_leaf = self.min_weight_leaf + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + return 1 + + return 0 + # Introduce a fused-class to make it possible to share the split implementation # between the dense and sparse cases in the node_split_best and node_split_random # functions. 
The alternative would have been to use inheritance-based polymorphism @@ -229,7 +316,7 @@ cdef class Splitter: ctypedef fused Partitioner: DensePartitioner SparsePartitioner - + cdef inline int node_split_best( Splitter splitter, Partitioner partitioner, @@ -349,15 +436,13 @@ cdef inline int node_split_best( current_split.pos = p # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(current_split) == 1: continue criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() @@ -645,8 +730,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(current_split) == 1: continue # Evaluate split @@ -656,8 +740,7 @@ cdef inline int node_split_random( criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 1966651d8c89a..8140733a9fc26 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -13,6 +13,8 @@ import numpy as np cimport numpy as cnp +from libcpp.vector cimport vector + ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef cnp.npy_intp SIZE_t # Type for indices and counters @@ -33,40 +35,32 @@ cdef struct Node: SIZE_t n_node_samples # Number of samples at the node DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node - -cdef class Tree: - # The Tree object is a binary tree structure constructed by the - # TreeBuilder. The tree structure is used for predictions and - # feature importances. - - # Input/Output layout - cdef public SIZE_t n_features # Number of features in X - cdef SIZE_t* n_classes # Number of classes in y[:, k] - cdef public SIZE_t n_outputs # Number of outputs in y - cdef public SIZE_t max_n_classes # max(n_classes) - +cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. cdef public SIZE_t max_depth # Max depth of the tree cdef public SIZE_t node_count # Counter for node IDs cdef public SIZE_t capacity # Capacity of tree, in terms of nodes cdef Node* nodes # Array of nodes - cdef double* value # (capacity, n_outputs, max_n_classes) array of values - cdef SIZE_t value_stride # = n_outputs * max_n_classes - # Methods - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples) except -1 nogil + cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample + cdef double* value # Array of values prediction values for each node + + # Generic Methods: These are generic methods used by any tree. 
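Returning to the splitter changes just above: the inlined ``min_samples_leaf``/``min_weight_leaf`` checks in ``node_split_best`` and ``node_split_random`` are now routed through ``check_presplit_conditions`` and ``check_postsplit_conditions``. A conceptual pure-Python sketch of why that matters; the signatures below are simplified and illustrative only.

    class ToySplitter:
        def __init__(self, start, end, min_samples_leaf):
            self.start, self.end = start, end
            self.min_samples_leaf = min_samples_leaf

        def check_presplit_conditions(self, pos):
            # default rule: reject if either child would be too small
            return ((pos - self.start) < self.min_samples_leaf
                    or (self.end - pos) < self.min_samples_leaf)

    class PickySplitter(ToySplitter):
        def check_presplit_conditions(self, pos):
            # a downstream package can layer an extra stopping rule on top
            return super().check_presplit_conditions(pos) or (self.end - self.start) < 10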
cdef int _resize(self, SIZE_t capacity) except -1 nogil cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil - - cdef cnp.ndarray _get_value_ndarray(self) - cdef cnp.ndarray _get_node_ndarray(self) - - cpdef cnp.ndarray predict(self, object X) - + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples + ) except -1 nogil + + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) cdef cnp.ndarray _apply_sparse_csr(self, object X) @@ -78,6 +72,49 @@ cdef class Tree: cpdef compute_node_depths(self) cpdef compute_feature_importances(self, normalize=*) + # Abstract methods: these functions must be implemented by any decision tree + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node, + ) noexcept nogil + +cdef class Tree(BaseTree): + # The Supervised Tree object is a binary tree structure constructed by the + # TreeBuilder. The tree structure is used for predictions and + # feature importances. + # + # Value of upstream properties: + # - value_stride = n_outputs * max_n_classes + # - value = (capacity, n_outputs, max_n_classes) array of values + + # Input/Output layout for supervised tree + cdef public SIZE_t n_features # Number of features in X + cdef SIZE_t* n_classes # Number of classes in y[:, k] + cdef public SIZE_t n_outputs # Number of outputs in y + cdef public SIZE_t max_n_classes # max(n_classes) + + # Methods + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) + + cpdef cnp.ndarray predict(self, object X) # ============================================================================= # Tree builder @@ -91,8 +128,7 @@ cdef class TreeBuilder: # This class controls the various stopping criteria and the node splitting # evaluation order, e.g. depth-first or best-first. 
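For the abstract ``_compute_feature`` hook declared above, a conceptual pure-Python sketch: the default axis-aligned tree reads a single column of ``X``, whereas a projection-based subclass could combine several columns before comparing against the node threshold. Function names and signatures here are illustrative, not the Cython API.

    import numpy as np

    def axis_aligned_feature(X, sample_index, feature):
        # default behaviour: the value in one column of X
        return X[sample_index, feature]

    def projected_feature(X, sample_index, weights):
        # hypothetical projection-based variant: a weighted combination of columns
        return float(np.dot(X[sample_index], weights))

    X = np.array([[1.0, 2.0], [3.0, 4.0]])
    print(axis_aligned_feature(X, 0, 1))                   # 2.0
    print(projected_feature(X, 0, np.array([0.5, 0.5])))   # 1.5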
- cdef Splitter splitter # Splitting algorithm - + cdef Splitter splitter cdef SIZE_t min_samples_split # Minimum number of samples in an internal node cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf cdef double min_weight_leaf # Minimum weight in a leaf diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 75eed058bfd4e..e5b759aee23df 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -22,6 +22,8 @@ from libcpp.vector cimport vector from libcpp.algorithm cimport pop_heap from libcpp.algorithm cimport push_heap from libcpp cimport bool +from cython.operator cimport dereference as deref +from libc.stdlib cimport malloc, free import struct @@ -83,6 +85,7 @@ NODE_DTYPE = np.asarray((&dummy)).dtype # TreeBuilder # ============================================================================= + cdef class TreeBuilder: """Interface for different tree building strategies.""" @@ -196,9 +199,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef bint is_left cdef SIZE_t n_node_samples = splitter.n_samples cdef double weighted_n_node_samples - cdef SplitRecord split cdef SIZE_t node_id + cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef double impurity = INFINITY cdef SIZE_t n_constant_features cdef bint is_leaf @@ -248,7 +253,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_leaf = is_leaf or impurity <= EPSILON if not is_leaf: - splitter.node_split(impurity, &split, &n_constant_features) + splitter.node_split(impurity, split_ptr, &n_constant_features) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 @@ -256,8 +266,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (split.improvement + EPSILON < min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, impurity, n_node_samples, + node_id = tree._add_node(parent, is_left, is_leaf, split_ptr, + impurity, n_node_samples, weighted_n_node_samples) if node_id == INTPTR_MAX: @@ -297,6 +307,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen + + # free the memory created for the SplitRecord pointer + free(split_ptr) + if rc == -1: raise MemoryError() @@ -462,6 +476,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): FrontierRecord* res) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. 
""" cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef SIZE_t node_id cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 @@ -483,7 +499,11 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) if not is_leaf: - splitter.node_split(impurity, &split, &n_constant_features) + splitter.node_split(impurity, split_ptr, &n_constant_features) + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 is_leaf = (is_leaf or split.pos >= end or @@ -493,7 +513,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, impurity, n_node_samples, + split_ptr, impurity, n_node_samples, weighted_n_node_samples) if node_id == INTPTR_MAX: return -1 @@ -522,7 +542,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.improvement = 0.0 res.impurity_left = impurity res.impurity_right = impurity - + + free(split_ptr) return 0 @@ -530,190 +551,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # Tree # ============================================================================= -cdef class Tree: - """Array-based representation of a binary decision tree. - - The binary tree is represented as a number of parallel arrays. The i-th - element of each array holds information about the node `i`. Node 0 is the - tree's root. You can find a detailed description of all arrays in - `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split - nodes, resp. In this case the values of nodes of the other type are - arbitrary! - - Attributes - ---------- - node_count : int - The number of nodes (internal nodes + leaves) in the tree. - - capacity : int - The current capacity (i.e., size) of the arrays, which is at least as - great as `node_count`. - - max_depth : int - The depth of the tree, i.e. the maximum depth of its leaves. - - children_left : array of int, shape [node_count] - children_left[i] holds the node id of the left child of node i. - For leaves, children_left[i] == TREE_LEAF. Otherwise, - children_left[i] > i. This child handles the case where - X[:, feature[i]] <= threshold[i]. - - children_right : array of int, shape [node_count] - children_right[i] holds the node id of the right child of node i. - For leaves, children_right[i] == TREE_LEAF. Otherwise, - children_right[i] > i. This child handles the case where - X[:, feature[i]] > threshold[i]. - - feature : array of int, shape [node_count] - feature[i] holds the feature to split on, for the internal node i. - - threshold : array of double, shape [node_count] - threshold[i] holds the threshold for the internal node i. - - value : array of double, shape [node_count, n_outputs, max_n_classes] - Contains the constant prediction value of each node. - - impurity : array of double, shape [node_count] - impurity[i] holds the impurity (i.e., the value of the splitting - criterion) at node i. - - n_node_samples : array of int, shape [node_count] - n_node_samples[i] holds the number of training samples reaching node i. - - weighted_n_node_samples : array of double, shape [node_count] - weighted_n_node_samples[i] holds the weighted number of training samples - reaching node i. +cdef class BaseTree: + """Base class for Cython tree models. + + Downstream classes must implement """ - # Wrap for outside world. 
- # WARNING: these reference the current `nodes` and `value` buffers, which - # must not be freed by a subsequent memory allocation. - # (i.e. through `_resize` or `__setstate__`) - property n_classes: - def __get__(self): - return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) - - property children_left: - def __get__(self): - return self._get_node_ndarray()['left_child'][:self.node_count] - - property children_right: - def __get__(self): - return self._get_node_ndarray()['right_child'][:self.node_count] - - property n_leaves: - def __get__(self): - return np.sum(np.logical_and( - self.children_left == -1, - self.children_right == -1)) - - property feature: - def __get__(self): - return self._get_node_ndarray()['feature'][:self.node_count] - - property threshold: - def __get__(self): - return self._get_node_ndarray()['threshold'][:self.node_count] - - property impurity: - def __get__(self): - return self._get_node_ndarray()['impurity'][:self.node_count] - - property n_node_samples: - def __get__(self): - return self._get_node_ndarray()['n_node_samples'][:self.node_count] - - property weighted_n_node_samples: - def __get__(self): - return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] - - property value: - def __get__(self): - return self._get_value_ndarray()[:self.node_count] - - # TODO: Convert n_classes to cython.integral memory view once - # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): - """Constructor.""" - cdef SIZE_t dummy = 0 - size_t_dtype = np.array(dummy).dtype - - n_classes = _check_n_classes(n_classes, size_t_dtype) - - # Input/Output layout - self.n_features = n_features - self.n_outputs = n_outputs - self.n_classes = NULL - safe_realloc(&self.n_classes, n_outputs) - - self.max_n_classes = np.max(n_classes) - self.value_stride = n_outputs * self.max_n_classes - - cdef SIZE_t k - for k in range(n_outputs): - self.n_classes[k] = n_classes[k] - - # Inner structures - self.max_depth = 0 - self.node_count = 0 - self.capacity = 0 - self.value = NULL - self.nodes = NULL - - def __dealloc__(self): - """Destructor.""" - # Free all inner structures - free(self.n_classes) - free(self.value) - free(self.nodes) - - def __reduce__(self): - """Reduce re-implementation, for pickling.""" - return (Tree, (self.n_features, - sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), - self.n_outputs), self.__getstate__()) - - def __getstate__(self): - """Getstate re-implementation, for pickling.""" - d = {} - # capacity is inferred during the __setstate__ using nodes - d["max_depth"] = self.max_depth - d["node_count"] = self.node_count - d["nodes"] = self._get_node_ndarray() - d["values"] = self._get_value_ndarray() - return d - - def __setstate__(self, d): - """Setstate re-implementation, for unpickling.""" - self.max_depth = d["max_depth"] - self.node_count = d["node_count"] - - if 'nodes' not in d: - raise ValueError('You have loaded Tree version which ' - 'cannot be imported') - - node_ndarray = d['nodes'] - value_ndarray = d['values'] - - value_shape = (node_ndarray.shape[0], self.n_outputs, - self.max_n_classes) - - node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) - value_ndarray = _check_value_ndarray( - value_ndarray, - expected_dtype=np.dtype(np.float64), - expected_shape=value_shape - ) - - self.capacity = node_ndarray.shape[0] - if self._resize_c(self.capacity) != 0: - raise MemoryError("resizing tree to %d" % self.capacity) - - nodes = 
memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), - self.capacity * sizeof(Node)) - value = memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) - - cdef int _resize(self, SIZE_t capacity) except -1 nogil: + cdef int _resize( + self, + SIZE_t capacity + ) except -1 nogil: """Resize all inner arrays to `capacity`, if `capacity` == -1, then double the size of the inner arrays. @@ -725,7 +571,10 @@ cdef class Tree: with gil: raise MemoryError() - cdef int _resize_c(self, SIZE_t capacity=INTPTR_MAX) except -1 nogil: + cdef int _resize_c( + self, + SIZE_t capacity=INTPTR_MAX + ) except -1 nogil: """Guts of _resize Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -756,14 +605,87 @@ cdef class Tree: self.capacity = capacity return 0 - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples) except -1 nogil: - """Add a node to the tree. + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set split node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the split node data. + node : Node* + The pointer to the node that will hold the split node. + """ + # left_child and right_child will be set later for a split node + node.feature = split_node.feature + node.threshold = split_node.threshold + return 1 + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set leaf node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the leaf node data. + node : Node* + The pointer to the node that will hold the leaf node. + """ + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + return 1 + + cdef DTYPE_t _compute_feature(self, const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node) noexcept nogil: + """Compute feature from a given data matrix, X. + + In axis-aligned trees, this is simply the value in the column of X + for this specific feature. + """ + # the feature index + cdef DTYPE_t feature = X_ndarray[sample_index, node.feature] + return feature + + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples + ) except -1 nogil: + """Add a node to the tree. The new node registers itself as the child of its parent. - + Parameters + ---------- + parent : SIZE_t + The index of the parent. If '_TREE_UNDEFINED', then the current + node is a root node. + is_left : bint + Whether or not the current node is to the left of the parent node. + is_leaf : bint + Whether or not the current node is a leaf node. + split_node : SplitRecord* + A pointer to a SplitRecord pointer address. + impurity : double + The impurity of the node to be added. + n_node_samples : SIZE_t + The number of samples in the node. + weighted_n_node_samples : double + The weight of the samples in the node. + Returns (size_t)(-1) on error. 
""" cdef SIZE_t node_id = self.node_count @@ -784,28 +706,18 @@ cdef class Tree: self.nodes[parent].right_child = node_id if is_leaf: - node.left_child = _TREE_LEAF - node.right_child = _TREE_LEAF - node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED - + if self._set_leaf_node(split_node, node) != 1: + with gil: + raise RuntimeError else: - # left_child and right_child will be set later - node.feature = feature - node.threshold = threshold + if self._set_split_node(split_node, node) != 1: + with gil: + raise RuntimeError self.node_count += 1 return node_id - cpdef cnp.ndarray predict(self, object X): - """Predict target for X.""" - out = self._get_value_ndarray().take(self.apply(X), axis=0, - mode='clip') - if self.n_outputs == 1: - out = out.reshape(X.shape[0], self.max_n_classes) - return out - cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): @@ -835,13 +747,20 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature value + cdef DTYPE_t feature_value = 0 + with nogil: for i in range(n_samples): node = self.nodes + # While node not a leaf while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - if X_ndarray[i, node.feature] <= node.threshold: + + # compute the feature value to compare against threshold + feature_value = self._compute_feature(X_ndarray, i, node) + if feature_value <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -902,7 +821,6 @@ cdef class Tree: # ... and node.right_child != _TREE_LEAF: if feature_to_sample[node.feature] == i: feature_value = X_sample[node.feature] - else: feature_value = 0. @@ -951,6 +869,9 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature index + cdef DOUBLE_t feature + with nogil: for i in range(n_samples): node = self.nodes @@ -962,7 +883,9 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 - if X_ndarray[i, node.feature] <= node.threshold: + # compute the feature value to compare against threshold + feature = self._compute_feature(X_ndarray, i, node) + if feature <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1091,8 +1014,6 @@ cdef class Tree: cpdef compute_feature_importances(self, normalize=True): """Computes the importance of each feature (aka variable).""" - cdef Node* left - cdef Node* right cdef Node* nodes = self.nodes cdef Node* node = nodes cdef Node* end_node = node + self.node_count @@ -1105,13 +1026,9 @@ cdef class Tree: while node != end_node: if node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - left = &nodes[node.left_child] - right = &nodes[node.right_child] - - importances[node.feature] += ( - node.weighted_n_node_samples * node.impurity - - left.weighted_n_node_samples * left.impurity - - right.weighted_n_node_samples * right.impurity) + self._compute_feature_importances( + importances, node) + node += 1 for i in range(self.n_features): @@ -1127,44 +1044,27 @@ cdef class Tree: return np.asarray(importances) - cdef cnp.ndarray _get_value_ndarray(self): - """Wraps value as a 3-d NumPy array. - - The array keeps a reference to this Tree, which manages the underlying - memory. + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node + ) noexcept nogil: + """Compute feature importances from a Node in the Tree. 
+ + Wrapped in a private function to allow subclassing that + computes feature importances. """ - cdef cnp.npy_intp shape[3] - shape[0] = self.node_count - shape[1] = self.n_outputs - shape[2] = self.max_n_classes - cdef cnp.ndarray arr - arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + cdef Node* nodes = self.nodes + cdef Node* left + cdef Node* right - cdef cnp.ndarray _get_node_ndarray(self): - """Wraps nodes as a NumPy struct array. + left = &nodes[node.left_child] + right = &nodes[node.right_child] - The array keeps a reference to this Tree, which manages the underlying - memory. Individual fields are publicly accessible as properties of the - Tree. - """ - cdef cnp.npy_intp shape[1] - shape[0] = self.node_count - cdef cnp.npy_intp strides[1] - strides[0] = sizeof(Node) - cdef cnp.ndarray arr - Py_INCREF(NODE_DTYPE) - arr = PyArray_NewFromDescr( cnp.ndarray, - NODE_DTYPE, 1, shape, - strides, self.nodes, - cnp.NPY_ARRAY_DEFAULT, None) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + importances[node.feature] += ( + node.weighted_n_node_samples * node.impurity - + left.weighted_n_node_samples * left.impurity - + right.weighted_n_node_samples * right.impurity) def compute_partial_dependence(self, DTYPE_t[:, ::1] X, int[::1] target_features, @@ -1273,6 +1173,237 @@ cdef class Tree: total_weight) +cdef class Tree(BaseTree): + """Array-based representation of a binary decision tree. + + The binary tree is represented as a number of parallel arrays. The i-th + element of each array holds information about the node `i`. Node 0 is the + tree's root. You can find a detailed description of all arrays in + `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split + nodes, resp. In this case the values of nodes of the other type are + arbitrary! + + Attributes + ---------- + node_count : int + The number of nodes (internal nodes + leaves) in the tree. + + capacity : int + The current capacity (i.e., size) of the arrays, which is at least as + great as `node_count`. + + max_depth : int + The depth of the tree, i.e. the maximum depth of its leaves. + + children_left : array of int, shape [node_count] + children_left[i] holds the node id of the left child of node i. + For leaves, children_left[i] == TREE_LEAF. Otherwise, + children_left[i] > i. This child handles the case where + X[:, feature[i]] <= threshold[i]. + + children_right : array of int, shape [node_count] + children_right[i] holds the node id of the right child of node i. + For leaves, children_right[i] == TREE_LEAF. Otherwise, + children_right[i] > i. This child handles the case where + X[:, feature[i]] > threshold[i]. + + feature : array of int, shape [node_count] + feature[i] holds the feature to split on, for the internal node i. + + threshold : array of double, shape [node_count] + threshold[i] holds the threshold for the internal node i. + + value : array of double, shape [node_count, n_outputs, max_n_classes] + Contains the constant prediction value of each node. + + impurity : array of double, shape [node_count] + impurity[i] holds the impurity (i.e., the value of the splitting + criterion) at node i. + + n_node_samples : array of int, shape [node_count] + n_node_samples[i] holds the number of training samples reaching node i. 
+ + weighted_n_node_samples : array of double, shape [node_count] + weighted_n_node_samples[i] holds the weighted number of training samples + reaching node i. + """ + # Wrap for outside world. + # WARNING: these reference the current `nodes` and `value` buffers, which + # must not be freed by a subsequent memory allocation. + # (i.e. through `_resize` or `__setstate__`) + property n_classes: + def __get__(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + property children_left: + def __get__(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + property children_right: + def __get__(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + property n_leaves: + def __get__(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + property feature: + def __get__(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + property threshold: + def __get__(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + property impurity: + def __get__(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + property n_node_samples: + def __get__(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + property weighted_n_node_samples: + def __get__(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + property value: + def __get__(self): + return self._get_value_ndarray()[:self.node_count] + + # TODO: Convert n_classes to cython.integral memory view once + # https://github.com/cython/cython/issues/5243 is fixed + def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): + """Constructor.""" + cdef SIZE_t dummy = 0 + size_t_dtype = np.array(dummy).dtype + + n_classes = _check_n_classes(n_classes, size_t_dtype) + + # Input/Output layout + self.n_features = n_features + self.n_outputs = n_outputs + self.n_classes = NULL + safe_realloc(&self.n_classes, n_outputs) + + self.max_n_classes = np.max(n_classes) + self.value_stride = n_outputs * self.max_n_classes + + cdef SIZE_t k + for k in range(n_outputs): + self.n_classes[k] = n_classes[k] + + # Inner structures + self.max_depth = 0 + self.node_count = 0 + self.capacity = 0 + self.value = NULL + self.nodes = NULL + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.n_classes) + free(self.value) + free(self.nodes) + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (Tree, (self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["node_count"] = self.node_count + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.node_count = d["node_count"] + + if 'nodes' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + node_ndarray = d['nodes'] + value_ndarray = d['values'] + + value_shape = (node_ndarray.shape[0], self.n_outputs, + self.max_n_classes) + + node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) + value_ndarray = _check_value_ndarray( + value_ndarray, + expected_dtype=np.dtype(np.float64), + 
expected_shape=value_shape + ) + + self.capacity = node_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + nodes = memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + value = memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(double)) + + cdef cnp.ndarray _get_value_ndarray(self): + """Wraps value as a 3-d NumPy array. + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef cnp.npy_intp shape[3] + shape[0] = self.node_count + shape[1] = self.n_outputs + shape[2] = self.max_n_classes + cdef cnp.ndarray arr + arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cdef cnp.ndarray _get_node_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. + """ + cdef cnp.npy_intp shape[1] + shape[0] = self.node_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Node) + cdef cnp.ndarray arr + Py_INCREF(NODE_DTYPE) + arr = PyArray_NewFromDescr( cnp.ndarray, + NODE_DTYPE, 1, shape, + strides, self.nodes, + cnp.NPY_ARRAY_DEFAULT, None) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cpdef cnp.ndarray predict(self, object X): + """Predict target for X.""" + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') + if self.n_outputs == 1: + out = out.reshape(X.shape[0], self.max_n_classes) + return out + + def _check_n_classes(n_classes, expected_dtype): if n_classes.ndim != 1: raise ValueError( @@ -1755,6 +1886,8 @@ cdef _build_pruned_tree( stack[BuildPrunedRecord] prune_stack BuildPrunedRecord stack_record + SplitRecord split + with nogil: # push root node onto stack prune_stack.push({"start": 0, "depth": 0, "parent": _TREE_UNDEFINED, "is_left": 0}) @@ -1771,8 +1904,12 @@ cdef _build_pruned_tree( is_leaf = leaves_in_subtree[orig_node_id] node = &orig_tree.nodes[orig_node_id] + # redefine to a SplitRecord to pass into _add_node + split.feature = node.feature + split.threshold = node.threshold + new_node_id = tree._add_node( - parent, is_left, is_leaf, node.feature, node.threshold, + parent, is_left, is_leaf, &split, node.impurity, node.n_node_samples, node.weighted_n_node_samples) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 1f3a9bf394b9b..69f948839259a 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -300,7 +300,7 @@ def test_xor(): clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) - clf = Tree(random_state=0, max_features=1) + clf = Tree(random_state=0, max_features=X.shape[1]) clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) @@ -440,7 +440,7 @@ def test_importances(): X, y = datasets.make_classification( n_samples=5000, n_features=10, - n_informative=3, + n_informative=4, n_redundant=0, n_repeated=0, shuffle=False, @@ -455,7 +455,7 @@ def test_importances(): n_important = np.sum(importances > 0.1) assert importances.shape[0] == 10, "Failed with {0}".format(name) - assert n_important == 3, "Failed with {0}".format(name) + assert n_important == 4, "Failed with {0}".format(name) # Check on iris 
that importances are the same for all builders clf = DecisionTreeClassifier(random_state=0) @@ -466,9 +466,9 @@ def test_importances(): assert_array_equal(clf.feature_importances_, clf2.feature_importances_) -def test_importances_raises(): +@pytest.mark.parametrize("clf", [DecisionTreeClassifier()]) +def test_importances_raises(clf): # Check if variable importance before fit raises ValueError. - clf = DecisionTreeClassifier() with pytest.raises(ValueError): getattr(clf, "feature_importances_") @@ -653,6 +653,7 @@ def test_min_samples_leaf(): est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) + # drop inner nodes leaf_count = node_counts[node_counts != 0] assert np.min(leaf_count) > 4, "Failed with {0}".format(name) @@ -677,7 +678,7 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): else: X = DATASETS[datasets]["X"].astype(np.float32) y = DATASETS[datasets]["y"] - + rng = np.random.RandomState(42) weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) @@ -828,7 +829,7 @@ def test_min_impurity_decrease(): ) # Check with a much lower value of 0.0001 est3 = TreeEstimator( - max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=0 + max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=1 ) # Check with a much lower value of 0.1 est4 = TreeEstimator( @@ -918,6 +919,7 @@ def test_pickle(): est2 = pickle.loads(serialized_object) assert type(est2) == est.__class__ + # score should match before/after pickling score2 = est2.score(X, y) assert ( score == score2 @@ -1031,7 +1033,6 @@ def test_memory_layout(): ALL_TREES.items(), [np.float64, np.float32] ): est = TreeEstimator(random_state=0) - # Nothing X = np.asarray(iris.data, dtype=dtype) y = iris.target @@ -1052,6 +1053,11 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) + # Strided + X = np.asarray(iris.data[::3], dtype=dtype) + y = iris.target[::3] + assert_array_equal(est.fit(X, y).predict(X), y) + # csr matrix X = csr_matrix(iris.data, dtype=dtype) y = iris.target @@ -1062,11 +1068,6 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) - # Strided - X = np.asarray(iris.data[::3], dtype=dtype) - y = iris.target[::3] - assert_array_equal(est.fit(X, y).predict(X), y) - def test_sample_weight(): # Check sample weighting. 
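
As a rough illustration of what the `_compute_feature_importances` refactor above accumulates per split node (and what `test_importances` exercises), the weighted impurity decrease can be sketched in plain NumPy as follows. The function name and the flat per-node arrays are hypothetical stand-ins for the Cython `Node` buffers, not the actual fork API.

    import numpy as np

    def impurity_decrease_importances(feature, weighted_n, impurity,
                                      left, right, n_features, normalize=True):
        # Arrays are indexed by node id; `left`/`right` hold child ids, -1 for leaves.
        importances = np.zeros(n_features)
        for node in range(len(feature)):
            if left[node] == -1:  # leaves contribute nothing
                continue
            l, r = left[node], right[node]
            importances[feature[node]] += (
                weighted_n[node] * impurity[node]
                - weighted_n[l] * impurity[l]
                - weighted_n[r] * impurity[r]
            )
        importances /= weighted_n[0]  # scale by weighted samples at the root
        if normalize and importances.sum() > 0.0:
            importances /= importances.sum()
        return importances

The result corresponds, up to the subclass hook, to what `compute_feature_importances(normalize=True)` exposes as `feature_importances_` on a fitted tree.
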
@@ -1260,7 +1261,7 @@ def test_behaviour_constant_feature_after_splits(): y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3] for name, TreeEstimator in ALL_TREES.items(): # do not check extra random trees - if "ExtraTree" not in name: + if all(_name not in name for _name in ["ExtraTree"]): est = TreeEstimator(random_state=0, max_features=1) est.fit(X, y) assert est.tree_.max_depth == 2 @@ -1586,6 +1587,7 @@ def check_min_weight_leaf_split_level(name): sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2] _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight) + # skip for sparse inputs _check_min_weight_leaf_split_level(TreeEstimator, csc_matrix(X), y, sample_weight) @@ -1644,6 +1646,7 @@ def check_decision_path(name): # Assert that leaves index are correct leaves = est.apply(X) leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)] + assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) # Ensure only one leave node per sample @@ -1930,6 +1933,7 @@ def assert_is_subtree(tree, subtree): def test_apply_path_readonly_all_trees(name, splitter, X_format): dataset = DATASETS["clf_small"] X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False) + if X_format == "dense": X_readonly = create_memmap_backed_data(X_small) else: From 475bd05f779a4be4f301f751ac86ba6a998a219a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 29 Mar 2023 09:41:10 -0700 Subject: [PATCH 02/39] Docs (#39) #### Reference Issues/PRs Fixes README and wheel building --------- Signed-off-by: Adam Li --- README.rst | 36 ++++++++++++--------- build_tools/azure/install.sh | 2 +- build_tools/github/repair_windows_wheels.sh | 2 +- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/README.rst b/README.rst index fbdfdaa95ef4c..7a7bd41c42846 100644 --- a/README.rst +++ b/README.rst @@ -44,6 +44,10 @@ .. |PytestMinVersion| replace:: 5.3.1 .. |PlotlyMinVersion| replace:: 5.10.0 +================= +Scikit-learn-tree +================= + ``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is released under the name ``scikit-learn-tree`` to avoid confusion. @@ -94,8 +98,7 @@ Installing scikit-learn-tree ============================ Scikit-learn-tree is a maintained fork of scikit-learn, which extends the -tree submodule in a few ways documented in :ref:`changelog of the fork -`. +tree submodule in a few ways documented in `fork_changelog`_. We release versions of scikit-learn-tree in an analagous fashion to scikit-learn main. Due to maintenance resources, we only release on PyPi @@ -103,12 +106,11 @@ and recommend therefore installing with ``pip``. There are different ways to install scikit-learn-tree: - * :ref:`Install the latest official release `. This + * Install the latest official release `install_fork_release`_. This is the best approach for most users. It will provide a stable version and pre-built packages are available for most platforms. - * :ref:`Building the package from source - `. This is best for users who want the + * Building the package from source `install_source`_. This is best for users who want the latest-and-greatest features and aren't afraid of running brand-new code. This is also needed for users who wish to contribute to the project. @@ -119,9 +121,7 @@ Installing the latest release ----------------------------- We release wheels for common distributions and this is thus installable via pip. -.. 
prompt:: bash $ - - pip install scikit-learn-tree + pip install scikit-learn-tree This will install ``scikit-learn-tree`` under the namespace of ``sklearn``, which then can be used as a stand-in for any package that relies on the public API of ``sklearn``. @@ -146,9 +146,11 @@ features to the fork, the building from source instructions are exactly the same as that of scikit-learn main, so please refer to `scikit-learn documentation `_ for instructions on building from source. -Development =========== +Development +----------- + We welcome new contributors of all experience levels, specifically to maintain the fork. Any contributions that make sure our fork is "better in-line" with scikit-learn upstream, or improves the tree submodule in anyway will be appreciated. @@ -158,15 +160,17 @@ The scikit-learn community goals are to be helpful, welcoming, and effective. Th has detailed information about contributing code, documentation, tests, and more. We've included some basic information in this README. -.. _fork-changelog: -Major Changes of the Fork ========================= +.. _fork_changelog: + +Major Changes of the Fork +------------------------- + The purpose of this page is to illustrate some of the main features that ``scikit-learn-tree`` provides compared to ``scikit-learn``. It assumes a an understanding of core package ``scikit-learn`` and also decision trees -models. Please refer to our :ref:`installation instructions -` for installing ``scikit-learn-tree``. +models. Please refer to our installation instructions `install_fork_release`_ for installing ``scikit-learn-tree``. Scikit-learn-tree though operates as a stand-in for upstream ``scikit-learn``. It is used in packages exactly the same way and will support all features @@ -193,7 +197,7 @@ Candidate changes and PRs accepted into the fork are those that: Decision tree generalizations ----------------------------- -``Scikit-learn`` provides an axis-aligned :class:`~sklearn.tree.DecisionTreeClassifier` +``Scikit-learn`` provides an axis-aligned `sklearn.tree.DecisionTreeClassifier `_ decision tree model (classifier and regressor), which has a few fundamental limitations that prevent 3rd parties from utilizing the existing class, without forking a large amount of copy/pasted Python and Cython code. We highlight those limitations here @@ -239,8 +243,8 @@ Python API: random forests and their variants to scale to millions of samples. - Our fix: We added a ``max_bins=None`` keyword argument to the ``BaseForest`` class, and all its subclasses. The default behavior is no binning. The current implementation is not necessarily efficient. There are several improvements to be made. See below. -Overall, the existing tree models, such as :class:`~sklearn.tree.DecisionTreeClassifier` -and :class:`~sklearn.ensemble.RandomForestClassifier` all work exactly the same as they +Overall, the existing tree models, such as `sklearn.tree.DecisionTreeClassifier `_ +and `sklearn.ensemble.RandomForestClassifier `_ all work exactly the same as they would in ``scikit-learn`` main, but these extensions enable 3rd-party packages to extend the Cython/Python API easily. 
diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index db5b5d9414053..5238cd1121d2e 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -7,7 +7,7 @@ set -x source build_tools/shared.sh UNAMESTR=`uname` -CCACHE_LINKS_DIR="/tmp/ccachev2" +CCACHE_LINKS_DIR="/tmp/ccache" setup_ccache() { CCACHE_BIN=`which ccache || echo ""` diff --git a/build_tools/github/repair_windows_wheels.sh b/build_tools/github/repair_windows_wheels.sh index cdd0c0c79d8c4..a857e61067960 100755 --- a/build_tools/github/repair_windows_wheels.sh +++ b/build_tools/github/repair_windows_wheels.sh @@ -9,7 +9,7 @@ DEST_DIR=$2 # By default, the Windows wheels are not repaired. # In this case, we need to vendor VCRUNTIME140.dll wheel unpack "$WHEEL" -WHEEL_DIRNAME=$(ls -d scikit_learn-*) +WHEEL_DIRNAME=$(ls -d scikit_learn_tree-*) python build_tools/github/vendor.py "$WHEEL_DIRNAME" wheel pack "$WHEEL_DIRNAME" -d "$DEST_DIR" rm -rf "$WHEEL_DIRNAME" From 706a74273bf736066b1d71eeed9da08c0943e311 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 4 Apr 2023 14:47:24 -0700 Subject: [PATCH 03/39] Release v1.2.2 #### Reference Issues/PRs #### What does this implement/fix? Explain your changes. #### Any other comments? --------- Signed-off-by: Adam Li --- .github/workflows/check-upstream.yml | 27 +++++++++++++++++++++++++++ sklearn/__init__.py | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/check-upstream.yml diff --git a/.github/workflows/check-upstream.yml b/.github/workflows/check-upstream.yml new file mode 100644 index 0000000000000..80e8ace610607 --- /dev/null +++ b/.github/workflows/check-upstream.yml @@ -0,0 +1,27 @@ +# Create Github Actions workflow that checks upstream scikit-learn 'main' branch and +# creates or updates +# an existing pull request to https://github.com/neurodata/scikit-learn:fork. +# Runs the check weekly. +# Creates a pull request if there are changes. + +# name: Check upstream scikit-learn + +# on: +# schedule: +# - cron: '0 0 * * 0' + +# jobs: +# check-upstream: +# runs-on: ubuntu-latest +# steps: +# - uses: actions/checkout@v2 +# - name: Check upstream scikit-learn +# uses: neurodata/check-upstream@main +# with: +# upstream: scikit-learn/scikit-learn +# fork: neurodata/scikit-learn +# branch: fork +# token: ${{ secrets.GITHUB_TOKEN }} + +# # Creates a pull request if there are changes. + diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 47bb893bd00a0..6d5af7c771fb8 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -39,7 +39,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = "1.3.dev0" +__version__ = "1.2.2" # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded From a22db039704399a31d466be861f2b5a86bbc51b3 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 11 Apr 2023 15:25:44 -0400 Subject: [PATCH 04/39] Update README Signed-off-by: Adam Li --- README.rst | 4 ++-- sklearn/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 7a7bd41c42846..444ead93017b9 100644 --- a/README.rst +++ b/README.rst @@ -48,7 +48,7 @@ Scikit-learn-tree ================= -``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line +``scikit-learn-tree`` is an alias of scikit-learn. 
It is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is released under the name ``scikit-learn-tree`` to avoid confusion. @@ -85,7 +85,7 @@ Installation Dependencies ~~~~~~~~~~~~ -scikit-learn requires: +scikit-learn-tree requires: - Python (>= |PythonMinVersion|) - NumPy (>= |NumPyMinVersion|) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 6d5af7c771fb8..4d7badd6b678e 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -39,7 +39,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = "1.2.2" +__version__ = "1.3.0dev0" # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded From 9c5321daa396e0fd01cc6e582a5dfcc8ccb1afe5 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:09:58 -0400 Subject: [PATCH 05/39] Adding working submodule Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 23b999d76326e..f4a1a80123d26 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1022,6 +1022,7 @@ cdef class BaseTree: cdef Node* end_node = node + self.node_count cdef double normalizer = 0. + cdef int i = 0 cdef cnp.float64_t[:] importances = np.zeros(self.n_features) From f82f2582c0c5e347fd9a6109129c3ae7853b0593 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:40:52 -0400 Subject: [PATCH 06/39] Merged main Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 2 ++ sklearn/tree/_splitter.pyx | 4 ++-- sklearn/tree/_tree.pyx | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 3419c6fa08819..01975df22ef23 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -110,6 +110,8 @@ cdef class Splitter(BaseSplitter): cdef bint check_presplit_conditions( self, SplitRecord current_split, + SIZE_t n_missing, + bint missing_go_to_left, ) noexcept nogil cdef bint check_postsplit_conditions( self diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index c8df3de1bb900..ae6cd772e37f7 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -505,7 +505,7 @@ cdef inline int node_split_best( current_split.pos = p # Reject if min_samples_leaf is not guaranteed - if splitter.check_presplit_conditions(current_split) == 1: + if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue criterion.update(current_split.pos) @@ -834,7 +834,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if splitter.check_presplit_conditions(current_split) == 1: + if splitter.check_presplit_conditions(current_split, 0, 0) == 1: continue # Evaluate split diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index c8248ed65c36b..33a2a8308de5f 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -764,7 +764,7 @@ cdef class BaseTree: # While node not a leaf while node.left_child != _TREE_LEAF: - X_i_node_features = self._compute_feature(X_ndarray, i, node) + X_i_node_feature = self._compute_feature(X_ndarray, i, node) # ... 
and node.right_child != _TREE_LEAF: if isnan(X_i_node_feature): if node.missing_go_to_left: From 7e38502806e954d9b3084f8a5e22602556236fe4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:42:44 -0400 Subject: [PATCH 07/39] Successful merge with the missing value support Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 2 ++ sklearn/tree/tests/test_tree.py | 32 ++++++++++++++------------------ 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 638c51f1101bc..21fa5b7c200b2 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -388,6 +388,7 @@ def _fit( X, y, sample_weight, + feature_has_missing, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -403,6 +404,7 @@ def _build_tree( X, y, sample_weight, + feature_has_missing, min_samples_leaf, min_weight_leaf, max_leaf_nodes, diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 6be168e4c8e7c..eefae6cdaa3f6 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -300,7 +300,7 @@ def test_xor(): clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) - clf = Tree(random_state=0, max_features=X.shape[1]) + clf = Tree(random_state=0, max_features=1) clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) @@ -440,7 +440,7 @@ def test_importances(): X, y = datasets.make_classification( n_samples=5000, n_features=10, - n_informative=4, + n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, @@ -455,7 +455,7 @@ def test_importances(): n_important = np.sum(importances > 0.1) assert importances.shape[0] == 10, "Failed with {0}".format(name) - assert n_important == 4, "Failed with {0}".format(name) + assert n_important == 3, "Failed with {0}".format(name) # Check on iris that importances are the same for all builders clf = DecisionTreeClassifier(random_state=0) @@ -466,9 +466,9 @@ def test_importances(): assert_array_equal(clf.feature_importances_, clf2.feature_importances_) -@pytest.mark.parametrize("clf", [DecisionTreeClassifier()]) -def test_importances_raises(clf): +def test_importances_raises(): # Check if variable importance before fit raises ValueError. 
+ clf = DecisionTreeClassifier() with pytest.raises(ValueError): getattr(clf, "feature_importances_") @@ -653,7 +653,6 @@ def test_min_samples_leaf(): est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) - # drop inner nodes leaf_count = node_counts[node_counts != 0] assert np.min(leaf_count) > 4, "Failed with {0}".format(name) @@ -678,7 +677,7 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): else: X = DATASETS[datasets]["X"].astype(np.float32) y = DATASETS[datasets]["y"] - rng = np.random.RandomState(42) + weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) @@ -829,7 +828,7 @@ def test_min_impurity_decrease(): ) # Check with a much lower value of 0.0001 est3 = TreeEstimator( - max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=1 + max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=0 ) # Check with a much lower value of 0.1 est4 = TreeEstimator( @@ -919,7 +918,6 @@ def test_pickle(): est2 = pickle.loads(serialized_object) assert type(est2) == est.__class__ - # score should match before/after pickling score2 = est2.score(X, y) assert ( score == score2 @@ -1033,6 +1031,7 @@ def test_memory_layout(): ALL_TREES.items(), [np.float64, np.float32] ): est = TreeEstimator(random_state=0) + # Nothing X = np.asarray(iris.data, dtype=dtype) y = iris.target @@ -1053,11 +1052,6 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) - # Strided - X = np.asarray(iris.data[::3], dtype=dtype) - y = iris.target[::3] - assert_array_equal(est.fit(X, y).predict(X), y) - # csr matrix X = csr_matrix(iris.data, dtype=dtype) y = iris.target @@ -1068,6 +1062,11 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) + # Strided + X = np.asarray(iris.data[::3], dtype=dtype) + y = iris.target[::3] + assert_array_equal(est.fit(X, y).predict(X), y) + def test_sample_weight(): # Check sample weighting. 
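
For reference, the traversal that `apply` performs after this merge (including the missing-value routing via `missing_go_to_left`) can be sketched in pure Python roughly as below. `compute_feature` stands in for the Cython `_compute_feature` hook, which is a plain column lookup `x[node.feature]` for axis-aligned trees and which subclasses may override; the function name and the `TREE_LEAF` sentinel here are illustrative, not the exact internal API.

    import numpy as np

    TREE_LEAF = -1

    def apply_one(nodes, x, compute_feature):
        # nodes: sequence of objects with left_child, right_child, feature,
        # threshold and missing_go_to_left attributes; node 0 is the root.
        node_id = 0
        while nodes[node_id].left_child != TREE_LEAF:
            node = nodes[node_id]
            value = compute_feature(x, node)  # x[node.feature] in the axis-aligned case
            if np.isnan(value):
                node_id = node.left_child if node.missing_go_to_left else node.right_child
            elif value <= node.threshold:
                node_id = node.left_child
            else:
                node_id = node.right_child
        return node_id  # id of the leaf reached by sample x
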
@@ -1261,7 +1260,7 @@ def test_behaviour_constant_feature_after_splits(): y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3] for name, TreeEstimator in ALL_TREES.items(): # do not check extra random trees - if all(_name not in name for _name in ["ExtraTree"]): + if "ExtraTree" not in name: est = TreeEstimator(random_state=0, max_features=1) est.fit(X, y) assert est.tree_.max_depth == 2 @@ -1587,7 +1586,6 @@ def check_min_weight_leaf_split_level(name): sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2] _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight) - # skip for sparse inputs _check_min_weight_leaf_split_level(TreeEstimator, csc_matrix(X), y, sample_weight) @@ -1646,7 +1644,6 @@ def check_decision_path(name): # Assert that leaves index are correct leaves = est.apply(X) leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)] - assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) # Ensure only one leave node per sample @@ -1933,7 +1930,6 @@ def assert_is_subtree(tree, subtree): def test_apply_path_readonly_all_trees(name, splitter, X_format): dataset = DATASETS["clf_small"] X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False) - if X_format == "dense": X_readonly = create_memmap_backed_data(X_small) else: From 34a562130d9c92b083b6da99c27a12a7623226b7 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:53:07 -0400 Subject: [PATCH 08/39] Add cyton headers Signed-off-by: Adam Li --- sklearn/tree/_criterion.pyx | 3 +++ sklearn/tree/_splitter.pyx | 3 +++ sklearn/tree/_tree.pyx | 3 +++ sklearn/tree/_utils.pyx | 3 +++ 4 files changed, 12 insertions(+) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 9c59e75fedb10..8fbcafcaf1456 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index ae6cd772e37f7..a58514d093ddf 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 33a2a8308de5f..2256b28c7df10 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 669d69409fdc3..0a7522bcf4255 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Arnaud Joly From f35c758189c8d38bfed56071b8c9a6cbbd39056f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 14:04:19 -0400 Subject: [PATCH 09/39] Fix imports to be absolute Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 32 ++++++++++++++++---------------- sklearn/tree/_export.py | 11 ++++++++--- sklearn/tree/_utils.pxd | 2 +- sklearn/tree/_utils.pyx | 2 +- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py 
index 21fa5b7c200b2..4fdd8f27cd652 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -25,22 +25,22 @@ import numpy as np from scipy.sparse import issparse -from ..base import BaseEstimator -from ..base import ClassifierMixin -from ..base import clone -from ..base import RegressorMixin -from ..base import is_classifier -from ..base import MultiOutputMixin -from ..utils import Bunch -from ..utils import check_random_state -from ..utils.validation import _check_sample_weight -from ..utils.validation import assert_all_finite -from ..utils.validation import _assert_all_finite_element_wise -from ..utils import compute_sample_weight -from ..utils.multiclass import check_classification_targets -from ..utils.validation import check_is_fitted -from ..utils._param_validation import Hidden, Interval, StrOptions -from ..utils._param_validation import RealNotInt +from sklearn.base import BaseEstimator +from sklearn.base import ClassifierMixin +from sklearn.base import clone +from sklearn.base import RegressorMixin +from sklearn.base import is_classifier +from sklearn.base import MultiOutputMixin +from sklearn.utils import Bunch +from sklearn.utils import check_random_state +from sklearn.utils.validation import _check_sample_weight +from sklearn.utils.validation import assert_all_finite +from sklearn.utils.validation import _assert_all_finite_element_wise +from sklearn.utils import compute_sample_weight +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_is_fitted +from sklearn.utils._param_validation import Hidden, Interval, StrOptions +from sklearn.utils._param_validation import RealNotInt from ._criterion import BaseCriterion from ._splitter import BaseSplitter diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index e8dbe51138223..be545de0202d0 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -16,10 +16,15 @@ import numpy as np -from ..utils.validation import check_is_fitted, check_array -from ..utils._param_validation import Interval, validate_params, StrOptions, HasMethods +from sklearn.utils.validation import check_is_fitted, check_array +from sklearn.utils._param_validation import ( + Interval, + validate_params, + StrOptions, + HasMethods, +) -from ..base import is_classifier +from sklearn.base import is_classifier from . import _criterion from . 
import _tree diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 4938d3030245f..f7bae4c5c8553 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -10,7 +10,7 @@ cimport numpy as cnp from ._tree cimport Node -from ..neighbors._quad_tree cimport Cell +from sklearn.neighbors._quad_tree cimport Cell ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 0a7522bcf4255..bc7e17f8766d8 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -19,7 +19,7 @@ import numpy as np cimport numpy as cnp cnp.import_array() -from ..utils._random cimport our_rand_r +from sklearn.utils._random cimport our_rand_r # ============================================================================= # Helper functions From 45320b4d3ef05b4ccbe81e8c13676b1c755d1973 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 14:17:25 -0400 Subject: [PATCH 10/39] Fix forest import Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 4cc672bb6884d..4d9bf862bd806 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -50,11 +50,16 @@ class calls the ``fit`` method of each sub-estimator on random samples from scipy.sparse import issparse from scipy.sparse import hstack as sparse_hstack -from ..base import is_classifier -from ..base import ClassifierMixin, MultiOutputMixin, RegressorMixin, TransformerMixin +from sklearn.base import is_classifier +from sklearn.base import ( + ClassifierMixin, + MultiOutputMixin, + RegressorMixin, + TransformerMixin, +) -from ..metrics import accuracy_score, r2_score -from ..preprocessing import OneHotEncoder +from sklearn.metrics import accuracy_score, r2_score +from sklearn.preprocessing import OneHotEncoder from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -63,21 +68,21 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DTYPE, DOUBLE -from ..utils import check_random_state, compute_sample_weight -from ..exceptions import DataConversionWarning -from ._base import BaseEnsemble, _partition_estimators -from ..utils.parallel import delayed, Parallel -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.validation import ( +from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.exceptions import DataConversionWarning +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.utils.parallel import delayed, Parallel +from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.validation import ( check_is_fitted, _check_sample_weight, _check_feature_names_in, ) -from ..utils._openmp_helpers import _openmp_effective_n_threads -from ..utils.validation import _num_samples -from ..utils._param_validation import Interval, StrOptions -from ..utils._param_validation import RealNotInt -from ._hist_gradient_boosting.binning import _BinMapper +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils.validation import _num_samples +from sklearn.utils._param_validation import Interval, StrOptions +from sklearn.utils._param_validation import RealNotInt +from sklearn.ensemble._hist_gradient_boosting.binning 
import _BinMapper __all__ = [ "RandomForestClassifier", From 49526f026c46727aa272be7bdd7a44d0101c089f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 13 Jun 2023 15:19:07 -0400 Subject: [PATCH 11/39] Fix classes and criterion Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 67 ++++++++++++++++++++++++++++++++++++ sklearn/tree/_criterion.pxd | 11 ++++-- sklearn/tree/_criterion.pyx | 68 +++++++++++++++++++++++++++++++++++-- 3 files changed, 141 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 4fdd8f27cd652..795c68c8b5081 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -713,6 +713,73 @@ def feature_importances_(self): return self.tree_.compute_feature_importances() + def _get_y_for_leaves(self, X, sample_weight=None): + n_samples = X.shape[0] + + # get the predictions + X_leaves = self.apply(X) + + bootstrap_indices = np.empty(shape, dtype=np.int64) + for i, estimator in enumerate(self.estimators_): + # Get bootstrap indices. + if self.bootstrap: + n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples) + bootstrap_indices[:, i] = _generate_sample_indices( + estimator.random_state, n_samples, n_samples_bootstrap + ) + else: + bootstrap_indices[:, i] = np.arange(n_samples) + + # Get predictions on bootstrap indices. + X_leaves[:, i] = X_leaves[bootstrap_indices[:, i], i] + + if sorter is not None: + # Reassign bootstrap indices to account for target sorting. + bootstrap_indices = np.argsort(sorter)[bootstrap_indices] + + bootstrap_indices += 1 # for sparse matrix (0s as empty) + + # Get the maximum number of nodes (internal + leaves) across trees. + # Get the maximum number of samples per leaf across trees (if needed). + max_node_count = 0 + max_samples_leaf = 0 if not leaf_subsample else max_samples_leaf + for i, estimator in enumerate(self.estimators_): + node_count = estimator.tree_.node_count + if node_count > max_node_count: + max_node_count = node_count + if not leaf_subsample: + sample_count = np.max(np.bincount(X_leaves[:, i])) + if sample_count > max_samples_leaf: + max_samples_leaf = sample_count + + # Initialize NumPy array (more efficient serialization than dict/list). + shape = (self.n_estimators, max_node_count, max_samples_leaf) + y_train_leaves = np.zeros(shape, dtype=np.int64) + + for i, estimator in enumerate(self.estimators_): + # Group training indices by leaf node. + leaf_indices, leaf_values_list = _group_by_value(X_leaves[:, i]) + + if leaf_subsample: + random.seed(estimator.random_state) + + # Map each leaf node to its list of training indices. + for leaf_idx, leaf_values in zip(leaf_indices, leaf_values_list): + y_indices = bootstrap_indices[:, i][leaf_values] + + if sample_weight is not None: + y_indices = y_indices[sample_weight[y_indices - 1] > 0] + + # Subsample leaf training indices (without replacement). + if leaf_subsample and max_samples_leaf < len(y_indices): + if not isinstance(y_indices, list): + y_indices = list(y_indices) + y_indices = random.sample(y_indices, max_samples_leaf) + + y_train_leaves[i, leaf_idx, : len(y_indices)] = y_indices + + return y_train_leaves + # ============================================================================= # Public estimators diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 6cfc33c5bdcea..d72f22f8b348d 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -11,6 +11,8 @@ # See _criterion.pyx for implementation details. 
+# from libcpp.vector cimport vector + from ._tree cimport DTYPE_t # Type of X from ._tree cimport DOUBLE_t # Type of y, sample_weight from ._tree cimport SIZE_t # Type for indices and counters @@ -19,7 +21,7 @@ from ._tree cimport UINT32_t # Unsigned 32 bit integer cdef class BaseCriterion: - """Abstract interface for criterion.""" + """Abstract interface for criterion.""" # Internal structures cdef const DOUBLE_t[:] sample_weight # Sample weights @@ -70,13 +72,18 @@ cdef class BaseCriterion: SIZE_t end ) noexcept nogil + # cdef void node_samples( + # self, + # vector[vector[DOUBLE_t]]* dest + # ) noexcept nogil + cdef class Criterion(BaseCriterion): """Abstract interface for supervised impurity criteria.""" cdef const DOUBLE_t[:, ::1] y # Values of y cdef SIZE_t n_missing # Number of missing values for the feature being evaluated cdef bint missing_go_to_left # Whether missing values go to the left node - + cdef int init( self, const DOUBLE_t[:, ::1] y, diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 8fbcafcaf1456..e9c02ab2fa43d 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -39,10 +39,13 @@ cdef class BaseCriterion: covariates, or labels, or both. Although scikit-learn currently only contains supervised tree methods, this class enables 3rd party packages to leverage scikit-learn's Cython code for criteria. + The downstream classes _must_ implement methods to compute the impurity in current node and in children nodes. + This object stores methods on how to calculate how good a split is using a set API. + Samples in the "current" node are stored in `samples[start:end]` which is partitioned around `pos` (an index in `start:end`) so that: - the samples of left child node are stored in `samples[start:pos]` @@ -56,21 +59,25 @@ cdef class BaseCriterion: cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. + This method must be implemented by the subclass. """ pass cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. + This method must be implemented by the subclass. """ pass cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. + This updates the collected statistics by moving sample_indices[pos:new_pos] from the right child to the left child. It must be implemented by the subclass. + Parameters ---------- new_pos : SIZE_t @@ -80,6 +87,7 @@ cdef class BaseCriterion: cdef double node_impurity(self) noexcept nogil: """Placeholder for calculating the impurity of the node. + Placeholder for a method which will evaluate the impurity of the current node, i.e. the impurity of sample_indices[start:end]. This is the primary function of the criterion class. The smaller the impurity the @@ -90,9 +98,11 @@ cdef class BaseCriterion: cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Placeholder for calculating the impurity of children. + Placeholder for a method which evaluates the impurity in children nodes, i.e. the impurity of sample_indices[start:pos] + the impurity of sample_indices[pos:end]. + Parameters ---------- impurity_left : double pointer @@ -106,8 +116,10 @@ cdef class BaseCriterion: cdef void node_value(self, double* dest) noexcept nogil: """Placeholder for storing the node value. + Placeholder for a method which will compute the node value of sample_indices[start:end] and save the value into dest. 
+ Parameters ---------- dest : double pointer @@ -117,10 +129,12 @@ cdef class BaseCriterion: cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -135,6 +149,7 @@ cdef class BaseCriterion: double impurity_left, double impurity_right) noexcept nogil: """Compute the improvement in impurity. + This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: N_t / N * (impurity - N_t_R / N_t * right_impurity @@ -142,6 +157,7 @@ cdef class BaseCriterion: where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, + Parameters ---------- impurity_parent : double @@ -150,6 +166,7 @@ cdef class BaseCriterion: The impurity of the left child impurity_right : double The impurity of the right child + Return ------ double : improvement in impurity after the split occurs @@ -166,10 +183,12 @@ cdef class BaseCriterion: SIZE_t end ) noexcept nogil: """Abstract method which will set sample pointers in the criterion. + The dataset array that we compute criteria on is assumed to consist of 'N' ordered samples or rows (i.e. sorted). Since we pass this by reference, we use sample pointers to move the start and end around to consider only a subset of data. This function should also update relevant statistics that the class uses to compute the final criterion. + Parameters ---------- start : SIZE_t @@ -182,11 +201,13 @@ cdef class BaseCriterion: cdef class Criterion(BaseCriterion): """Interface for impurity criteria. + The supervised criterion computes the impurity of a node and the reduction of impurity of a split on that node using the distribution of labels in parent and - children nodes. It also computes the output statistics - such as the mean in regression and class probabilities in classification. - Instances of this class are responsible for compute splits' impurity difference + children nodes. It also computes the output statistics such as the mean in regression + and class probabilities in classification. Instances of this class are responsible + for compute splits' impurity difference. + Criterion is the base class for criteria used in supervised tree-based models with a homogeneous float64-dtyped y. """ @@ -198,8 +219,10 @@ cdef class Criterion(BaseCriterion): const SIZE_t[:] sample_indices ) except -1 nogil: """Placeholder for a method which will initialize the criterion. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. + Parameters ---------- y : ndarray, dtype=DOUBLE_t @@ -279,6 +302,7 @@ cdef class ClassificationCriterion(Criterion): def __cinit__(self, SIZE_t n_outputs, cnp.ndarray[SIZE_t, ndim=1] n_classes): """Initialize attributes for this criterion. + Parameters ---------- n_outputs : SIZE_t @@ -331,8 +355,10 @@ cdef class ClassificationCriterion(Criterion): const SIZE_t[:] sample_indices ) except -1 nogil: """Initialize the criterion. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. 
+ Parameters ---------- y : ndarray, dtype=DOUBLE_t @@ -426,6 +452,7 @@ cdef class ClassificationCriterion(Criterion): cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -442,6 +469,7 @@ cdef class ClassificationCriterion(Criterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -458,8 +486,10 @@ cdef class ClassificationCriterion(Criterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. + Parameters ---------- new_pos : SIZE_t @@ -532,6 +562,7 @@ cdef class ClassificationCriterion(Criterion): cdef void node_value(self, double* dest) noexcept nogil: """Compute the node value of sample_indices[start:end] and save it into dest. + Parameters ---------- dest : double pointer @@ -546,17 +577,20 @@ cdef class ClassificationCriterion(Criterion): cdef class Entropy(ClassificationCriterion): r"""Cross Entropy impurity criterion. + This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) be the proportion of class k observations in node m. + The cross-entropy is then defined as cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the cross-entropy criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -578,8 +612,10 @@ cdef class Entropy(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). + Parameters ---------- impurity_left : double pointer @@ -611,11 +647,13 @@ cdef class Entropy(ClassificationCriterion): cdef class Gini(ClassificationCriterion): r"""Gini Index impurity criterion. + This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) be the proportion of class k observations in node m. + The Gini Index is then defined as: index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 @@ -623,6 +661,7 @@ cdef class Gini(ClassificationCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the Gini criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -648,8 +687,10 @@ cdef class Gini(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]) using the Gini index. 
+ Parameters ---------- impurity_left : double pointer @@ -726,6 +767,7 @@ cdef inline void _move_sums_regression( cdef class RegressionCriterion(Criterion): r"""Abstract regression criterion. + This handles cases where the target is a continuous value, and is evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` @@ -736,6 +778,7 @@ cdef class RegressionCriterion(Criterion): def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. + Parameters ---------- n_outputs : SIZE_t @@ -961,6 +1004,7 @@ cdef class MSE(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the MSE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -976,10 +1020,12 @@ cdef class MSE(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. The MSE proxy is derived from @@ -1002,6 +1048,7 @@ cdef class MSE(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -1045,6 +1092,7 @@ cdef class MSE(RegressionCriterion): cdef class MAE(RegressionCriterion): r"""Mean absolute error impurity criterion. + MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true value and f_i is the predicted value.""" @@ -1056,6 +1104,7 @@ cdef class MAE(RegressionCriterion): def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. + Parameters ---------- n_outputs : SIZE_t @@ -1154,6 +1203,7 @@ cdef class MAE(RegressionCriterion): cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1184,6 +1234,7 @@ cdef class MAE(RegressionCriterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1211,6 +1262,7 @@ cdef class MAE(RegressionCriterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1273,6 +1325,7 @@ cdef class MAE(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the MAE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -1297,6 +1350,7 @@ cdef class MAE(RegressionCriterion): cdef void children_impurity(self, double* p_impurity_left, double* p_impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. 
the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -1343,6 +1397,7 @@ cdef class MAE(RegressionCriterion): cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman. + Uses the formula (35) in Friedman's original Gradient Boosting paper: diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) @@ -1350,10 +1405,12 @@ cdef class FriedmanMSE(MSE): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -1394,6 +1451,7 @@ cdef class FriedmanMSE(MSE): cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. + Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. It remains the @@ -1413,6 +1471,7 @@ cdef class Poisson(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the Poisson criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -1422,10 +1481,12 @@ cdef class Poisson(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. The Poisson proxy is derived from: @@ -1463,6 +1524,7 @@ cdef class Poisson(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity of the right child (sample_indices[pos:end]) for Poisson. """ From 2105949178bf03660c13df1fd197abbbb57d826e Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 13 Jun 2023 15:22:15 -0400 Subject: [PATCH 12/39] Working.. Signed-off-by: Adam Li --- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 4 +++- sklearn/tree/_splitter.pxd | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index d72f22f8b348d..20020b4a5361c 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -11,7 +11,7 @@ # See _criterion.pyx for implementation details. 
-# from libcpp.vector cimport vector +from libcpp.vector cimport vector from ._tree cimport DTYPE_t # Type of X from ._tree cimport DOUBLE_t # Type of y, sample_weight diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e9c02ab2fa43d..d60cab3063c1b 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -34,7 +34,9 @@ from ._utils cimport WeightedMedianCalculator cdef double EPSILON = 10 * np.finfo('double').eps cdef class BaseCriterion: - """This is an abstract interface for criterion. For example, a tree model could + """This is an abstract interface for criterion. + + For example, a tree model could be either supervisedly, or unsupervisedly computing impurity on samples of covariates, or labels, or both. Although scikit-learn currently only contains supervised tree methods, this class enables 3rd party packages to leverage diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 01975df22ef23..fc49471569ecc 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -97,7 +97,7 @@ cdef class BaseSplitter: cdef class Splitter(BaseSplitter): cdef public Criterion criterion # Impurity criterion cdef const DOUBLE_t[:, ::1] y - + cdef int init( self, object X, From 9b07f2ab2b1b6f8f4ea1294fce1a5f9bd3be1a1d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 13 Jun 2023 15:42:37 -0400 Subject: [PATCH 13/39] Add leaf storage ability Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 67 ---------------------------- sklearn/tree/_criterion.pxd | 9 ++-- sklearn/tree/_criterion.pyx | 28 +++++++++--- sklearn/tree/_splitter.pxd | 3 ++ sklearn/tree/_splitter.pyx | 31 ++++++------- sklearn/tree/_tree.pxd | 19 ++++++-- sklearn/tree/_tree.pyx | 88 +++++++++++++++++++++++++------------ 7 files changed, 122 insertions(+), 123 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 795c68c8b5081..4fdd8f27cd652 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -713,73 +713,6 @@ def feature_importances_(self): return self.tree_.compute_feature_importances() - def _get_y_for_leaves(self, X, sample_weight=None): - n_samples = X.shape[0] - - # get the predictions - X_leaves = self.apply(X) - - bootstrap_indices = np.empty(shape, dtype=np.int64) - for i, estimator in enumerate(self.estimators_): - # Get bootstrap indices. - if self.bootstrap: - n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples) - bootstrap_indices[:, i] = _generate_sample_indices( - estimator.random_state, n_samples, n_samples_bootstrap - ) - else: - bootstrap_indices[:, i] = np.arange(n_samples) - - # Get predictions on bootstrap indices. - X_leaves[:, i] = X_leaves[bootstrap_indices[:, i], i] - - if sorter is not None: - # Reassign bootstrap indices to account for target sorting. - bootstrap_indices = np.argsort(sorter)[bootstrap_indices] - - bootstrap_indices += 1 # for sparse matrix (0s as empty) - - # Get the maximum number of nodes (internal + leaves) across trees. - # Get the maximum number of samples per leaf across trees (if needed). 
- max_node_count = 0 - max_samples_leaf = 0 if not leaf_subsample else max_samples_leaf - for i, estimator in enumerate(self.estimators_): - node_count = estimator.tree_.node_count - if node_count > max_node_count: - max_node_count = node_count - if not leaf_subsample: - sample_count = np.max(np.bincount(X_leaves[:, i])) - if sample_count > max_samples_leaf: - max_samples_leaf = sample_count - - # Initialize NumPy array (more efficient serialization than dict/list). - shape = (self.n_estimators, max_node_count, max_samples_leaf) - y_train_leaves = np.zeros(shape, dtype=np.int64) - - for i, estimator in enumerate(self.estimators_): - # Group training indices by leaf node. - leaf_indices, leaf_values_list = _group_by_value(X_leaves[:, i]) - - if leaf_subsample: - random.seed(estimator.random_state) - - # Map each leaf node to its list of training indices. - for leaf_idx, leaf_values in zip(leaf_indices, leaf_values_list): - y_indices = bootstrap_indices[:, i][leaf_values] - - if sample_weight is not None: - y_indices = y_indices[sample_weight[y_indices - 1] > 0] - - # Subsample leaf training indices (without replacement). - if leaf_subsample and max_samples_leaf < len(y_indices): - if not isinstance(y_indices, list): - y_indices = list(y_indices) - y_indices = random.sample(y_indices, max_samples_leaf) - - y_train_leaves[i, leaf_idx, : len(y_indices)] = y_indices - - return y_train_leaves - # ============================================================================= # Public estimators diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 20020b4a5361c..721b475f40436 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -72,10 +72,6 @@ cdef class BaseCriterion: SIZE_t end ) noexcept nogil - # cdef void node_samples( - # self, - # vector[vector[DOUBLE_t]]* dest - # ) noexcept nogil cdef class Criterion(BaseCriterion): """Abstract interface for supervised impurity criteria.""" @@ -94,6 +90,11 @@ cdef class Criterion(BaseCriterion): cdef void init_sum_missing(self) cdef void init_missing(self, SIZE_t n_missing) noexcept nogil + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]* dest + ) noexcept nogil + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index d60cab3063c1b..c3f08ec859bee 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -46,7 +46,7 @@ cdef class BaseCriterion: in current node and in children nodes. This object stores methods on how to calculate how good a split is using - a set API. + a set API. Samples in the "current" node are stored in `samples[start:end]` which is partitioned around `pos` (an index in `start:end`) so that: @@ -186,9 +186,9 @@ cdef class BaseCriterion: ) noexcept nogil: """Abstract method which will set sample pointers in the criterion. - The dataset array that we compute criteria on is assumed to consist of 'N' - ordered samples or rows (i.e. sorted). Since we pass this by reference, we - use sample pointers to move the start and end around to consider only a subset of data. + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. This function should also update relevant statistics that the class uses to compute the final criterion. 
Parameters @@ -252,10 +252,28 @@ cdef class Criterion(BaseCriterion): Number of missing values for specific feature. """ pass - + cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]* dest + ) noexcept nogil: + cdef SIZE_t i, j + + # Resize the destination vector of vectors + dest.resize(self.n_node_samples) + + # Loop over the samples + for i in range(self.n_node_samples): + # Get the index of the current sample + j = self.sample_indices[self.start + i] + + # Get the sample values for each output + for k in range(self.n_outputs): + dest[i][k].push_back(self.y[j, k]) + cdef inline void _move_sums_classification( ClassificationCriterion criterion, double[:, ::1] sum_1, diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index fc49471569ecc..fb21f676e66cc 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -10,6 +10,7 @@ # License: BSD 3 clause # See _splitter.pyx for details. +from libcpp.vector cimport vector from ._criterion cimport BaseCriterion, Criterion @@ -106,6 +107,8 @@ cdef class Splitter(BaseSplitter): const unsigned char[::1] feature_has_missing, ) except -1 + cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil + # Methods that allow modifications to stopping conditions cdef bint check_presplit_conditions( self, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index a58514d093ddf..7f21d5da545fb 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -53,12 +53,12 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil self.n_missing = 0 cdef class BaseSplitter: - """This is an abstract interface for splitters. + """This is an abstract interface for splitters. For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of covariates, labels, or both. Although scikit-learn currently only contains supervised tree methods, this class enables 3rd party packages to leverage - scikit-learn's Cython code for splitting. + scikit-learn's Cython code for splitting. A splitter is usually used in conjunction with a criterion class, which explicitly handles computing the criteria, which we split on. The setting of that criterion class is handled @@ -112,7 +112,7 @@ cdef class BaseSplitter: cdef int pointer_size(self) noexcept nogil: """Size of the pointer for split records. - + Overriding this function allows one to use different subclasses of `SplitRecord`. """ @@ -156,7 +156,6 @@ cdef class Splitter(BaseSplitter): self.min_weight_leaf = min_weight_leaf self.random_state = random_state - def __reduce__(self): return (type(self), (self.criterion, self.max_features, @@ -281,6 +280,10 @@ cdef class Splitter(BaseSplitter): self.criterion.node_value(dest) + cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil: + """Copy the samples[start:end] into dest.""" + self.criterion.node_samples(dest) + cdef double node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" @@ -293,7 +296,7 @@ cdef class Splitter(BaseSplitter): bint missing_go_to_left, ) noexcept nogil: """Check stopping conditions pre-split. - + This is typically a metric that is cheaply computed given the current proposed split, which is stored as a the `current_split` argument. 
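The `node_samples` hook introduced here copies, for the node between `start` and `end`, each sample's `y` row into a C++ vector-of-vectors. In Python terms the per-leaf payload it is meant to populate behaves roughly like the mapping below (a sketch of the intent only, with made-up values; the actual container lives on the Cython tree):

```python
# Rough Python picture of what node_samples() collects per leaf: for every
# leaf node id, the (n_leaf_samples, n_outputs) training targets that ended
# up in that leaf. Values here are made up for illustration.
import numpy as np

leaf_value_samples = {
    3: np.array([[1.2], [1.4], [1.3]]),  # leaf id -> stored y rows
    4: np.array([[5.0], [5.5]]),
}

# With the raw targets retained, per-leaf statistics beyond the mean become
# cheap, e.g. a 90th-percentile prediction for samples routed to leaf 3:
print(np.quantile(leaf_value_samples[3], 0.9, axis=0))
```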
@@ -301,7 +304,7 @@ cdef class Splitter(BaseSplitter): cdef SIZE_t min_samples_leaf = self.min_samples_leaf cdef SIZE_t end_non_missing = self.end - n_missing cdef SIZE_t n_left, n_right - + if missing_go_to_left: n_left = current_split.pos - self.start + n_missing n_right = end_non_missing - current_split.pos @@ -312,14 +315,14 @@ cdef class Splitter(BaseSplitter): # Reject if min_samples_leaf is not guaranteed if n_left < min_samples_leaf or n_right < min_samples_leaf: return 1 - + return 0 cdef bint check_postsplit_conditions( self ) noexcept nogil: """Check stopping conditions after evaluating the split. - + This takes some metric that is stored in the Criterion object and checks against internal stop metrics. """ @@ -329,10 +332,10 @@ cdef class Splitter(BaseSplitter): if ((self.criterion.weighted_n_left < min_weight_leaf) or (self.criterion.weighted_n_right < min_weight_leaf)): return 1 - + return 0 - + cdef inline void shift_missing_values_to_left_if_required( SplitRecord* best, SIZE_t[::1] samples, @@ -360,7 +363,7 @@ cdef inline void shift_missing_values_to_left_if_required( ctypedef fused Partitioner: DensePartitioner SparsePartitioner - + cdef inline int node_split_best( Splitter splitter, Partitioner partitioner, @@ -504,9 +507,9 @@ cdef inline int node_split_best( if p >= end_non_missing: continue - + current_split.pos = p - + # Reject if min_samples_leaf is not guaranteed if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue @@ -740,8 +743,6 @@ cdef inline int node_split_random( cdef SIZE_t n_features = splitter.n_features cdef SIZE_t max_features = splitter.max_features - cdef SIZE_t min_samples_leaf = splitter.min_samples_leaf - cdef double min_weight_leaf = splitter.min_weight_leaf cdef UINT32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index cbe85886cd865..94714cc33400c 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -14,6 +14,7 @@ import numpy as np cimport numpy as cnp from libcpp.vector cimport vector +from libcpp.unordered_map cimport unordered_map ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight @@ -36,6 +37,7 @@ cdef struct Node: DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node unsigned char missing_go_to_left # Whether features have missing values + cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. @@ -45,7 +47,14 @@ cdef class BaseTree: cdef Node* nodes # Array of nodes cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample - cdef double* value # Array of values prediction values for each node + cdef double* value # Array of values prediction values for each node + + # Enables the use of tree to store distributions of the output to allow + # arbitrary usage of the the leaves. This is used in the quantile + # estimators for example. + # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples # Generic Methods: These are generic methods used by any tree. 
cdef int _resize(self, SIZE_t capacity) except -1 nogil @@ -61,7 +70,7 @@ cdef class BaseTree: double weighted_n_node_samples, unsigned char missing_go_to_left ) except -1 nogil - + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) @@ -101,10 +110,10 @@ cdef class Tree(BaseTree): # The Supervised Tree object is a binary tree structure constructed by the # TreeBuilder. The tree structure is used for predictions and # feature importances. - # + # # Value of upstream properties: # - value_stride = n_outputs * max_n_classes - # - value = (capacity, n_outputs, max_n_classes) array of values + # - value = (capacity, n_outputs, max_n_classes) array of values # Input/Output layout for supervised tree cdef public SIZE_t n_features # Number of features in X @@ -137,6 +146,8 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping + cdef unsigned char store_leaf_values # Whether to store leaf values + cpdef build( self, Tree tree, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 2256b28c7df10..8ca98a64b42ab 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -158,15 +158,23 @@ cdef struct StackRecord: cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, double min_weight_leaf, - SIZE_t max_depth, double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + double min_impurity_decrease, + unsigned char store_leaf_values=False + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values cpdef build( self, @@ -221,6 +229,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t max_depth_seen = -1 cdef int rc = 0 + cdef int node_idx + cdef stack[StackRecord] builder_stack cdef StackRecord stack_record @@ -308,6 +318,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "is_left": 1, "impurity": split.impurity_left, "n_constant_features": n_constant_features}) + elif self.store_leaf_values and is_leaf: + with gil: + print('Storing leaf values...') + + # copy leaf values to leaf_values array + splitter.node_samples(&tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -317,7 +333,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen - + # free the memory created for the SplitRecord pointer free(split_ptr) @@ -364,10 +380,17 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """ cdef SIZE_t max_leaf_nodes - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, min_weight_leaf, - SIZE_t max_depth, SIZE_t max_leaf_nodes, - double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + SIZE_t max_leaf_nodes, + double min_impurity_decrease, + unsigned char store_leaf_values=False, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf @@ -375,6 +398,7 
@@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values cpdef build( self, @@ -488,7 +512,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """Adds node w/ partition ``[start, end)`` to the frontier. """ cdef SplitRecord split cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) - + cdef SIZE_t node_id cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 @@ -553,7 +577,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.improvement = 0.0 res.impurity_left = impurity res.impurity_right = impurity - + free(split_ptr) return 0 @@ -564,7 +588,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef class BaseTree: """Base class for Cython tree models. - + Downstream classes must implement """ cdef int _resize( @@ -622,7 +646,7 @@ cdef class BaseTree: Node* node ) except -1 nogil: """Set split node data. - + Parameters ---------- split_node : SplitRecord* @@ -641,7 +665,7 @@ cdef class BaseTree: Node* node ) except -1 nogil: """Set leaf node data. - + Parameters ---------- split_node : SplitRecord* @@ -655,9 +679,12 @@ cdef class BaseTree: node.threshold = _TREE_UNDEFINED return 1 - cdef DTYPE_t _compute_feature(self, const DTYPE_t[:, :] X_ndarray, - SIZE_t sample_index, - Node *node) noexcept nogil: + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil: """Compute feature from a given data matrix, X. In axis-aligned trees, this is simply the value in the column of X @@ -668,7 +695,7 @@ cdef class BaseTree: return feature cdef SIZE_t _add_node( - self, + self, SIZE_t parent, bint is_left, bint is_leaf, @@ -679,7 +706,9 @@ cdef class BaseTree: unsigned char missing_go_to_left ) except -1 nogil: """Add a node to the tree. + The new node registers itself as the child of its parent. + Parameters ---------- parent : SIZE_t @@ -697,7 +726,7 @@ cdef class BaseTree: The number of samples in the node. weighted_n_node_samples : double The weight of the samples in the node. - + Returns (size_t)(-1) on error. """ cdef SIZE_t node_id = self.node_count @@ -719,12 +748,12 @@ cdef class BaseTree: if is_leaf: if self._set_leaf_node(split_node, node) != 1: - with gil: - raise RuntimeError + with gil: + raise RuntimeError else: if self._set_split_node(split_node, node) != 1: - with gil: - raise RuntimeError + with gil: + raise RuntimeError node.missing_go_to_left = missing_go_to_left self.node_count += 1 @@ -796,8 +825,8 @@ cdef class BaseTree: # Extract input cdef const DTYPE_t[:] X_data = X.data - cdef const INT32_t[:] X_indices = X.indices - cdef const INT32_t[:] X_indptr = X.indptr + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] @@ -928,8 +957,8 @@ cdef class BaseTree: # Extract input cdef const DTYPE_t[:] X_data = X.data - cdef const INT32_t[:] X_indices = X.indices - cdef const INT32_t[:] X_indptr = X.indptr + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] @@ -1043,7 +1072,7 @@ cdef class BaseTree: # ... 
and node.right_child != _TREE_LEAF: self._compute_feature_importances( importances, node) - + node += 1 for i in range(self.n_features): @@ -1065,7 +1094,7 @@ cdef class BaseTree: Node* node ) noexcept nogil: """Compute feature importances from a Node in the Tree. - + Wrapped in a private function to allow subclassing that computes feature importances. """ @@ -1321,6 +1350,9 @@ cdef class Tree(BaseTree): self.value = NULL self.nodes = NULL + # initialize the hash map for the value samples + self.value_samples = unordered_map[SIZE_t, vector[vector[DOUBLE_t]]]() + def __dealloc__(self): """Destructor.""" # Free all inner structures From 21ccb30478bdff652118af59a4cd614a23f799d0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 15 Jun 2023 10:35:44 -0400 Subject: [PATCH 14/39] [ENH] Adding leaf node samples to be stored when "quantile" tree is turned on (#45) #### Reference Issues/PRs Addresses the quantile-trees part of: https://github.com/neurodata/scikit-tree/issues/29 #### What does this implement/fix? Explain your changes. 1. Stores for each leaf node a 2D numpy array of the y-samples (remember `y` is (n_samples, n_outputs)) 2. Does this all the way in Criterion 3. Only supports supervised tree/splitter/criterion 4. merges in `main` changes. #### Any other comments? --------- Signed-off-by: Adam Li --- doc/authors_emeritus.rst | 1 + doc/contributor_experience_team.rst | 12 +- doc/modules/classes.rst | 1 + doc/modules/learning_curve.rst | 42 +- doc/visualizations.rst | 1 + doc/whats_new/v1.3.rst | 286 +++++--- .../plot_kernel_ridge_regression.py | 1 + .../model_selection/plot_validation_curve.py | 46 +- sklearn/base.py | 10 +- sklearn/calibration.py | 7 +- sklearn/cluster/_affinity_propagation.py | 4 +- sklearn/cluster/_agglomerative.py | 5 +- sklearn/cluster/_bicluster.py | 4 +- sklearn/cluster/_birch.py | 8 +- sklearn/cluster/_bisect_k_means.py | 4 +- sklearn/cluster/_dbscan.py | 7 +- sklearn/cluster/_feature_agglomeration.py | 34 +- sklearn/cluster/_kmeans.py | 11 +- sklearn/cluster/_mean_shift.py | 3 +- sklearn/cluster/_optics.py | 7 +- sklearn/cluster/_spectral.py | 4 +- .../tests/test_feature_agglomeration.py | 24 + sklearn/compose/_column_transformer.py | 7 +- sklearn/compose/_target.py | 6 +- sklearn/covariance/_elliptic_envelope.py | 3 +- sklearn/covariance/_empirical_covariance.py | 3 +- sklearn/covariance/_graph_lasso.py | 5 +- sklearn/covariance/_robust_covariance.py | 3 +- sklearn/covariance/_shrunk_covariance.py | 8 +- sklearn/cross_decomposition/_pls.py | 7 +- sklearn/datasets/_arff_parser.py | 5 +- sklearn/datasets/tests/test_openml.py | 4 +- sklearn/decomposition/_dict_learning.py | 7 +- sklearn/decomposition/_factor_analysis.py | 4 +- sklearn/decomposition/_fastica.py | 7 +- sklearn/decomposition/_incremental_pca.py | 8 +- sklearn/decomposition/_kernel_pca.py | 4 +- sklearn/decomposition/_lda.py | 7 +- sklearn/decomposition/_nmf.py | 44 +- sklearn/decomposition/_pca.py | 7 +- sklearn/decomposition/_sparse_pca.py | 3 +- sklearn/decomposition/_truncated_svd.py | 4 +- sklearn/decomposition/tests/test_nmf.py | 27 + sklearn/discriminant_analysis.py | 9 +- sklearn/dummy.py | 7 +- sklearn/ensemble/_bagging.py | 8 +- sklearn/ensemble/_forest.py | 174 ++++- sklearn/ensemble/_gb.py | 8 +- .../gradient_boosting.py | 4 +- sklearn/ensemble/_iforest.py | 3 +- sklearn/ensemble/_stacking.py | 8 +- sklearn/ensemble/_voting.py | 11 +- sklearn/ensemble/_weight_boosting.py | 8 +- sklearn/ensemble/tests/test_forest.py | 51 ++ .../feature_extraction/_dict_vectorizer.py | 5 +- 
sklearn/feature_extraction/_hash.py | 4 +- sklearn/feature_extraction/image.py | 3 +- sklearn/feature_extraction/text.py | 14 +- sklearn/feature_selection/_from_model.py | 11 +- sklearn/feature_selection/_rfe.py | 11 +- sklearn/feature_selection/_sequential.py | 7 +- .../_univariate_selection.py | 4 +- .../feature_selection/_variance_threshold.py | 3 +- sklearn/gaussian_process/_gpc.py | 4 +- sklearn/gaussian_process/_gpr.py | 4 +- sklearn/impute/_base.py | 8 +- sklearn/impute/_iterative.py | 8 +- sklearn/impute/_knn.py | 3 +- sklearn/isotonic.py | 3 +- sklearn/kernel_approximation.py | 13 +- sklearn/kernel_ridge.py | 4 +- sklearn/linear_model/_base.py | 5 +- sklearn/linear_model/_bayes.py | 8 +- sklearn/linear_model/_coordinate_descent.py | 11 +- sklearn/linear_model/_glm/glm.py | 4 +- sklearn/linear_model/_huber.py | 3 +- sklearn/linear_model/_least_angle.py | 10 +- sklearn/linear_model/_logistic.py | 9 +- sklearn/linear_model/_omp.py | 7 +- sklearn/linear_model/_passive_aggressive.py | 9 +- sklearn/linear_model/_quantile.py | 3 +- sklearn/linear_model/_ransac.py | 7 +- sklearn/linear_model/_ridge.py | 13 +- sklearn/linear_model/_stochastic_gradient.py | 13 +- sklearn/linear_model/_theil_sen.py | 3 +- sklearn/manifold/_isomap.py | 13 +- sklearn/manifold/_locally_linear.py | 5 +- sklearn/manifold/_mds.py | 4 +- sklearn/manifold/_spectral_embedding.py | 4 +- sklearn/manifold/_t_sne.py | 11 +- sklearn/metrics/pairwise.py | 14 +- sklearn/mixture/_base.py | 4 +- sklearn/model_selection/__init__.py | 2 + sklearn/model_selection/_plot.py | 680 +++++++++++++++--- sklearn/model_selection/_search.py | 6 +- .../_search_successive_halving.py | 6 +- sklearn/model_selection/tests/test_plot.py | 337 +++++++-- sklearn/multiclass.py | 29 +- sklearn/multioutput.py | 26 +- sklearn/naive_bayes.py | 12 +- sklearn/neighbors/_classification.py | 12 +- sklearn/neighbors/_graph.py | 11 +- sklearn/neighbors/_kde.py | 7 +- sklearn/neighbors/_lof.py | 7 +- sklearn/neighbors/_nca.py | 4 +- sklearn/neighbors/_nearest_centroid.py | 4 +- sklearn/neighbors/_regression.py | 12 +- sklearn/neighbors/_unsupervised.py | 6 +- .../neural_network/_multilayer_perceptron.py | 12 +- sklearn/neural_network/_rbm.py | 9 +- sklearn/pipeline.py | 16 +- sklearn/preprocessing/_data.py | 57 +- sklearn/preprocessing/_discretization.py | 3 +- sklearn/preprocessing/_encoders.py | 7 +- .../preprocessing/_function_transformer.py | 3 +- sklearn/preprocessing/_label.py | 10 +- sklearn/preprocessing/_polynomial.py | 6 +- sklearn/preprocessing/_target_encoder.py | 5 +- sklearn/preprocessing/tests/test_data.py | 19 + sklearn/random_projection.py | 4 +- sklearn/semi_supervised/_label_propagation.py | 3 +- sklearn/semi_supervised/_self_training.py | 7 +- sklearn/svm/_base.py | 4 +- sklearn/svm/_classes.py | 7 +- sklearn/tests/test_metadata_routing.py | 15 + sklearn/tests/test_public_functions.py | 1 + sklearn/tree/_classes.py | 185 ++++- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 15 +- sklearn/tree/_splitter.pxd | 4 +- sklearn/tree/_splitter.pyx | 46 +- sklearn/tree/_tree.pxd | 20 +- sklearn/tree/_tree.pyx | 58 +- sklearn/tree/tests/test_tree.py | 175 ++++- sklearn/utils/_metadata_requests.py | 12 +- sklearn/utils/_plotting.py | 40 ++ sklearn/utils/estimator_checks.py | 19 +- sklearn/utils/tests/test_param_validation.py | 4 +- sklearn/utils/tests/test_plotting.py | 63 ++ sklearn/utils/tests/test_validation.py | 10 + sklearn/utils/validation.py | 51 +- 141 files changed, 2511 insertions(+), 797 deletions(-) create mode 100644 
sklearn/utils/tests/test_plotting.py diff --git a/doc/authors_emeritus.rst b/doc/authors_emeritus.rst index b979b77bba974..a56e2bc408ff4 100644 --- a/doc/authors_emeritus.rst +++ b/doc/authors_emeritus.rst @@ -20,6 +20,7 @@ - Wei Li - Paolo Losi - Gilles Louppe +- Chiara Marmo - Vincent Michel - Jarrod Millman - Alexandre Passos diff --git a/doc/contributor_experience_team.rst b/doc/contributor_experience_team.rst index 2e09d9069849a..00b658632302e 100644 --- a/doc/contributor_experience_team.rst +++ b/doc/contributor_experience_team.rst @@ -18,6 +18,10 @@
   Lucy Liu
+  Maxwell Liu
   Juan Martin Loyola
@@ -26,14 +30,6 @@
   Sylvain Marié
-  Chiara Marmo
-  Maxwell Liu
   Norbert Preining

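To make the motivation for the leaf-storage plumbing in this patch concrete: without it, per-leaf target distributions have to be rebuilt by hand after fitting, as in the sketch below. It uses only the standard scikit-learn API (no new keywords); the patch aims to keep this bookkeeping inside the tree instead.

```python
# What quantile-style prediction currently requires without leaf storage:
# regroup the training targets by the leaf each sample falls into.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
y = X[:, 0] + 0.1 * rng.normal(size=200)

reg = DecisionTreeRegressor(min_samples_leaf=20, random_state=0).fit(X, y)

# Manual pass over the training data: leaf id -> targets in that leaf.
train_leaves = reg.apply(X)
leaf_to_targets = {leaf: y[train_leaves == leaf] for leaf in np.unique(train_leaves)}

# Per-leaf 90th percentile for a few (training) samples.
new_leaves = reg.apply(X[:5])
q90 = np.array([np.quantile(leaf_to_targets[leaf], 0.9) for leaf in new_leaves])
print(q90)
```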
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 4961fb0fec366..204c300b1a9b8 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1247,6 +1247,7 @@ Visualization :template: display_only_from_estimator.rst model_selection.LearningCurveDisplay + model_selection.ValidationCurveDisplay .. _multiclass_ref: diff --git a/doc/modules/learning_curve.rst b/doc/modules/learning_curve.rst index 0ce64063d4cd9..3d458a1a67416 100644 --- a/doc/modules/learning_curve.rst +++ b/doc/modules/learning_curve.rst @@ -71,7 +71,7 @@ The function :func:`validation_curve` can help in this case:: >>> import numpy as np >>> from sklearn.model_selection import validation_curve >>> from sklearn.datasets import load_iris - >>> from sklearn.linear_model import Ridge + >>> from sklearn.svm import SVC >>> np.random.seed(0) >>> X, y = load_iris(return_X_y=True) @@ -80,30 +80,50 @@ The function :func:`validation_curve` can help in this case:: >>> X, y = X[indices], y[indices] >>> train_scores, valid_scores = validation_curve( - ... Ridge(), X, y, param_name="alpha", param_range=np.logspace(-7, 3, 3), - ... cv=5) + ... SVC(kernel="linear"), X, y, param_name="C", param_range=np.logspace(-7, 3, 3), + ... ) >>> train_scores - array([[0.93..., 0.94..., 0.92..., 0.91..., 0.92...], - [0.93..., 0.94..., 0.92..., 0.91..., 0.92...], - [0.51..., 0.52..., 0.49..., 0.47..., 0.49...]]) + array([[0.90..., 0.94..., 0.91..., 0.89..., 0.92...], + [0.9... , 0.92..., 0.93..., 0.92..., 0.93...], + [0.97..., 1... , 0.98..., 0.97..., 0.99...]]) >>> valid_scores - array([[0.90..., 0.84..., 0.94..., 0.96..., 0.93...], - [0.90..., 0.84..., 0.94..., 0.96..., 0.93...], - [0.46..., 0.25..., 0.50..., 0.49..., 0.52...]]) + array([[0.9..., 0.9... , 0.9... , 0.96..., 0.9... ], + [0.9..., 0.83..., 0.96..., 0.96..., 0.93...], + [1.... , 0.93..., 1.... , 1.... , 0.9... ]]) + +If you intend to plot the validation curves only, the class +:class:`~sklearn.model_selection.ValidationCurveDisplay` is more direct than +using matplotlib manually on the results of a call to :func:`validation_curve`. +You can use the method +:meth:`~sklearn.model_selection.ValidationCurveDisplay.from_estimator` similarly +to :func:`validation_curve` to generate and plot the validation curve: + +.. plot:: + :context: close-figs + :align: center + + from sklearn.datasets import load_iris + from sklearn.model_selection import ValidationCurveDisplay + from sklearn.svm import SVC + from sklearn.utils import shuffle + X, y = load_iris(return_X_y=True) + X, y = shuffle(X, y, random_state=0) + ValidationCurveDisplay.from_estimator( + SVC(kernel="linear"), X, y, param_name="C", param_range=np.logspace(-7, 3, 10) + ) If the training score and the validation score are both low, the estimator will be underfitting. If the training score is high and the validation score is low, the estimator is overfitting and otherwise it is working very well. A low training score and a high validation score is usually not possible. Underfitting, overfitting, and a working model are shown in the in the plot below where we vary -the parameter :math:`\gamma` of an SVM on the digits dataset. +the parameter `gamma` of an SVM with an RBF kernel on the digits dataset. .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_validation_curve_001.png :target: ../auto_examples/model_selection/plot_validation_curve.html :align: center :scale: 50% - .. 
_learning_curve: Learning curve diff --git a/doc/visualizations.rst b/doc/visualizations.rst index f692fd8efd1df..9a44f6feb1b48 100644 --- a/doc/visualizations.rst +++ b/doc/visualizations.rst @@ -89,3 +89,4 @@ Display Objects metrics.PredictionErrorDisplay metrics.RocCurveDisplay model_selection.LearningCurveDisplay + model_selection.ValidationCurveDisplay diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index bb35a1db224b4..41c03293cf067 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -29,11 +29,6 @@ random sampling procedures. `transform_algorithm` is not the same as `fit_algorithm` and the number of iterations is small. :pr:`24871` by :user:`Omar Salman `. -- |Fix| Treat more consistently small values in the `W` and `H` matrices during the - `fit` and `transform` steps of :class:`decomposition.NMF` and - :class:`decomposition.MiniBatchNMF` which can produce different results than previous - versions. :pr:`25438` by :user:`Yotam Avidar-Constantini `. - - |Enhancement| The `sample_weight` parameter now will be used in centroids initialization for :class:`cluster.KMeans`, :class:`cluster.BisectingKMeans` and :class:`cluster.MiniBatchKMeans`. @@ -43,6 +38,11 @@ random sampling procedures. :user:`Jérémie du Boisberranger `, :user:`Guillaume Lemaitre `. +- |Fix| Treat more consistently small values in the `W` and `H` matrices during the + `fit` and `transform` steps of :class:`decomposition.NMF` and + :class:`decomposition.MiniBatchNMF` which can produce different results than previous + versions. :pr:`25438` by :user:`Yotam Avidar-Constantini `. + - |Fix| :class:`decomposition.KernelPCA` may produce different results through `inverse_transform` if `gamma` is `None`. Now it will be chosen correctly as `1/n_features` of the data that it is fitted on, while previously it might be @@ -51,6 +51,14 @@ random sampling procedures. used each time the kernel is called. :pr:`26337` by :user:`Yao Xiao `. +Changed displays +---------------- + +- |Enhancement| :class:`model_selection.LearningCurveDisplay` displays both the + train and test curves by default. You can set `score_type="test"` to keep the + past behaviour. + :pr:`25120` by :user:`Guillaume Lemaitre `. + Changes impacting all modules ----------------------------- @@ -201,23 +209,9 @@ Changelog :mod:`sklearn.cluster` ...................... -- |API| The `sample_weight` parameter in `predict` for - :meth:`cluster.KMeans.predict` and :meth:`cluster.MiniBatchKMeans.predict` - is now deprecated and will be removed in v1.5. - :pr:`25251` by :user:`Gleb Levitski `. - -- |Enhancement| The `sample_weight` parameter now will be used in centroids - initialization for :class:`cluster.KMeans`, :class:`cluster.BisectingKMeans` - and :class:`cluster.MiniBatchKMeans`. - This change will break backward compatibility, since numbers generated - from same random seeds will be different. - :pr:`25752` by :user:`Gleb Levitski `, - :user:`Jérémie du Boisberranger `, - :user:`Guillaume Lemaitre `. - - |MajorFeature| Added :class:`cluster.HDBSCAN`, a modern hierarchical density-based clustering algorithm. Similarly to :class:`cluster.OPTICS`, it can be seen as a - generalization of :class:`DBSCAN` by allowing for hierarchical instead of flat + generalization of :class:`cluster.DBSCAN` by allowing for hierarchical instead of flat clustering, however it varies in its approach from :class:`cluster.OPTICS`. 
This algorithm is very robust with respect to its hyperparameters' values and can be used on a wide variety of data without much, if any, tuning. @@ -228,12 +222,30 @@ Changelog :pr:`26385` by :user:`Meekail Zain ` +- |Enhancement| The `sample_weight` parameter now will be used in centroids + initialization for :class:`cluster.KMeans`, :class:`cluster.BisectingKMeans` + and :class:`cluster.MiniBatchKMeans`. + This change will break backward compatibility, since numbers generated + from same random seeds will be different. + :pr:`25752` by :user:`Gleb Levitski `, + :user:`Jérémie du Boisberranger `, + :user:`Guillaume Lemaitre `. + +- |API| The `sample_weight` parameter in `predict` for + :meth:`cluster.KMeans.predict` and :meth:`cluster.MiniBatchKMeans.predict` + is now deprecated and will be removed in v1.5. + :pr:`25251` by :user:`Gleb Levitski `. + +- |API| The `Xred` argument in :func:`cluster.FeatureAgglomeration.inverse_transform` + is renamed to `Xt` and will be removed in v1.5. :pr:`26503` by `Adrin Jalali`_. + :mod:`sklearn.compose` ...................... -- |Fix| `compose.ColumnTransformer` raises an informative error when the individual transformers of `ColumnTransformer` - output pandas dataframes with indexes that are not consistent with each other and the output is configured - to be pandas. :pr:`26286` by `Thomas Fan`_. +- |Fix| `compose.ColumnTransformer` raises an informative error when the individual + transformers of `ColumnTransformer` output pandas dataframes with indexes that are + not consistent with each other and the output is configured to be pandas. + :pr:`26286` by `Thomas Fan`_. - |Fix| :class:`compose.ColumnTransformer` correctly sets the output of the remainder when `set_output` is called. :pr:`26323` by `Thomas Fan`_. @@ -241,6 +253,14 @@ Changelog :mod:`sklearn.covariance` ......................... +- |Fix| Allows `alpha=0` in :class:`covariance.GraphicalLasso` to be + consistent with :func:`covariance.graphical_lasso`. + :pr:`26033` by :user:`Genesis Valencia `. + +- |Fix| :func:`covariance.empirical_covariance` now gives an informative + error message when input is not appropriate. + :pr:`26108` by :user:`Quentin Barthélemy `. + - |API| Deprecates `cov_init` in :func:`covariance.graphical_lasso` in 1.3 since the parameter has no effect. It will be removed in 1.5. :pr:`26033` by :user:`Genesis Valencia `. @@ -256,20 +276,13 @@ Changelog :func:`covariance.graphical_lasso_path`, and :class:`covariance.GraphicalLassoCV`. :pr:`26033` by :user:`Genesis Valencia `. -- |Fix| Allows `alpha=0` in :class:`covariance.GraphicalLasso` to be - consistent with :func:`covariance.graphical_lasso`. - :pr:`26033` by :user:`Genesis Valencia `. - -- |Fix| :func:`covariance.empirical_covariance` now gives an informative - error message when input is not appropriate. - :pr:`26108` by :user:`Quentin Barthélemy `. - :mod:`sklearn.datasets` ....................... -- |API| The `data_transposed` argument of :func:`datasets.make_sparse_coded_signal` - is deprecated and will be removed in v1.5. - :pr:`25784` by :user:`Jérémie du Boisberranger`. +- |Enhancement| Allows to overwrite the parameters used to open the ARFF file using + the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the + pandas parser. + :pr:`26433` by :user:`Guillaume Lemaitre `. - |Fix| :func:`datasets.fetch_openml` returns improved data types when `as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_. @@ -279,28 +292,35 @@ Changelog the pandas parser. 
The parameter `read_csv_kwargs` allows to overwrite this behaviour. :pr:`26551` by :user:`Guillaume Lemaitre `. -- |Enhancement| Allows to overwrite the parameters used to open the ARFF file using - the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the - pandas parser. - :pr:`26433` by :user:`Guillaume Lemaitre `. +- |Fix| :func:`dataasets.fetch_openml` will consistenly use `np.nan` as missing marker + with both parsers `"pandas"` and `"liac-arff"`. + :pr:`26579` by :user:`Guillaume Lemaitre `. + +- |API| The `data_transposed` argument of :func:`datasets.make_sparse_coded_signal` + is deprecated and will be removed in v1.5. + :pr:`25784` by :user:`Jérémie du Boisberranger`. :mod:`sklearn.decomposition` ............................ -- |Enhancement| :class:`decomposition.DictionaryLearning` now accepts the parameter - `callback` for consistency with the function :func:`decomposition.dict_learning`. - :pr:`24871` by :user:`Omar Salman `. - - |Efficiency| :class:`decomposition.MiniBatchDictionaryLearning` and :class:`decomposition.MiniBatchSparsePCA` are now faster for small batch sizes by avoiding duplicate validations. :pr:`25490` by :user:`Jérémie du Boisberranger `. +- |Enhancement| :class:`decomposition.DictionaryLearning` now accepts the parameter + `callback` for consistency with the function :func:`decomposition.dict_learning`. + :pr:`24871` by :user:`Omar Salman `. + - |Fix| Treat more consistently small values in the `W` and `H` matrices during the `fit` and `transform` steps of :class:`decomposition.NMF` and :class:`decomposition.MiniBatchNMF` which can produce different results than previous versions. :pr:`25438` by :user:`Yotam Avidar-Constantini `. +- |API| The `W` argument in :func:`decomposition.NMF.inverse_transform` and + :class:`decomposition.MiniBatchNMF.inverse_transform` is renamed to `Xt` and + will be removed in v1.5. :pr:`26503` by `Adrin Jalali`_. + :mod:`sklearn.discriminant_analysis` .................................... @@ -364,6 +384,7 @@ Changelog :mod:`sklearn.exception` ........................ + - |Feature| Added :class:`exception.InconsistentVersionWarning` which is raised when a scikit-learn estimator is unpickled with a scikit-learn version that is inconsistent with the sckit-learn version the estimator was pickled with. @@ -393,6 +414,9 @@ Changelog - |Enhancement| Added the parameter `fill_value` to :class:`impute.IterativeImputer`. :pr:`25232` by :user:`Thijs van Weezel `. +- |Fix| :class:`impute.IterativeImputer` now correctly preserves the Pandas + Index when the `set_config(transform_output="pandas")`. :pr:`26454` by `Thomas Fan`_. + :mod:`sklearn.inspection` ......................... @@ -420,12 +444,6 @@ Changelog now preserve dtype for `numpy.float32`. :pr:`25587` by :user:`Omar Salman `. -- |API| Deprecates `n_iter` in favor of `max_iter` in - :class:`linear_model.BayesianRidge` and :class:`linear_model.ARDRegression`. - `n_iter` will be removed in scikit-learn 1.5. This change makes those - estimators consistent with the rest of estimators. - :pr:`25697` by :user:`John Pangas `. - - |Enhancement| The `n_iter_` attribute has been included in :class:`linear_model.ARDRegression` to expose the actual number of iterations required to reach the stopping criterion. @@ -436,36 +454,41 @@ Changelog on linearly separable problems. :pr:`25214` by `Tom Dupre la Tour`_. +- |API| Deprecates `n_iter` in favor of `max_iter` in + :class:`linear_model.BayesianRidge` and :class:`linear_model.ARDRegression`. 
+ `n_iter` will be removed in scikit-learn 1.5. This change makes those + estimators consistent with the rest of estimators. + :pr:`25697` by :user:`John Pangas `. + +:mod:`sklearn.manifold` +....................... + +- |Fix| :class:`manifold.Isomap` now correctly preserves the Pandas + Index when the `set_config(transform_output="pandas")`. :pr:`26454` by `Thomas Fan`_. + :mod:`sklearn.metrics` ...................... -- |Efficiency| The computation of the expected mutual information in - :func:`metrics.adjusted_mutual_info_score` is now faster when the number of - unique labels is large and its memory usage is reduced in general. - :pr:`25713` by :user:`Kshitij Mathur `, - :user:`Guillaume Lemaitre `, :user:`Omar Salman ` and - :user:`Jérémie du Boisberranger `. - - |Feature| Adds `zero_division=np.nan` to multiple classification metrics: - :func:`precision_score`, :func:`recall_score`, :func:`f1_score`, - :func:`fbeta_score`, :func:`precision_recall_fscore_support`, - :func:`classification_report`. When `zero_division=np.nan` and there is a + :func:`metrics.precision_score`, :func:`metrics.recall_score`, + :func:`metrics.f1_score`, :func:`metrics.fbeta_score`, + :func:`metrics.precision_recall_fscore_support`, + :func:`metrics.classification_report`. When `zero_division=np.nan` and there is a zero division, the metric is undefined and is excluded from averaging. When not used for averages, the value returned is `np.nan`. :pr:`25531` by :user:`Marc Torrellas Socastro `. -- |Fix| :func:`metric.manhattan_distances` now supports readonly sparse datasets. - :pr:`25432` by :user:`Julien Jerphanion `. - -- |Fix| Fixed :func:`classification_report` so that empty input will return - `np.nan`. Previously, "macro avg" and `weighted avg` would return - e.g. `f1-score=np.nan` and `f1-score=0.0`, being inconsistent. Now, they - both return `np.nan`. - :pr:`25531` by :user:`Marc Torrellas Socastro `. +- |Feature| :func:`metrics.average_precision_score` now supports the + multiclass case. + :pr:`17388` by :user:`Geoffrey Bolmier ` and + :pr:`24769` by :user:`Ashwin Mathur `. -- |Fix| :func:`metric.ndcg_score` now gives a meaningful error message for input of - length 1. - :pr:`25672` by :user:`Lene Preuss ` and :user:`Wei-Chun Chu `. +- |Efficiency| The computation of the expected mutual information in + :func:`metrics.adjusted_mutual_info_score` is now faster when the number of + unique labels is large and its memory usage is reduced in general. + :pr:`25713` by :user:`Kshitij Mathur `, + :user:`Guillaume Lemaitre `, :user:`Omar Salman ` and + :user:`Jérémie du Boisberranger `. - |Enhancement| :class:`metrics.silhouette_samples` nows accepts a sparse matrix of pairwise distances between samples, or a feature array. @@ -492,17 +515,23 @@ Changelog chance level. This line is exposed in the `chance_level_` attribute. :pr:`26019` by :user:`Yao Xiao `. -- |Fix| :func:`log_loss` raises a warning if the values of the parameter `y_pred` are - not normalized, instead of actually normalizing them in the metric. Starting from - 1.5 this will raise an error. :pr:`25299` by :user:`Omar Salman `. + +- |Fix| Fixed :func:`metrics.classification_report` so that empty input will return + `np.nan`. Previously, "macro avg" and `weighted avg` would return + e.g. `f1-score=np.nan` and `f1-score=0.0`, being inconsistent. Now, they + both return `np.nan`. + :pr:`25531` by :user:`Marc Torrellas Socastro `. -- |API| The `eps` parameter of the :func:`log_loss` has been deprecated and will be - removed in 1.5. 
:pr:`25299` by :user:`Omar Salman `. +- |Fix| :func:`metrics.ndcg_score` now gives a meaningful error message for input of + length 1. + :pr:`25672` by :user:`Lene Preuss ` and :user:`Wei-Chun Chu `. -- |Feature| :func:`metrics.average_precision_score` now supports the - multiclass case. - :pr:`17388` by :user:`Geoffrey Bolmier ` and - :pr:`24769` by :user:`Ashwin Mathur `. +- |Fix| :func:`metrics.log_loss` raises a warning if the values of the parameter + `y_pred` are not normalized, instead of actually normalizing them in the metric. + Starting from 1.5 this will raise an error. + :pr:`25299` by :user:`Omar Salman ` +- |API| The `eps` parameter of the :func:`metrics.log_loss` has been deprecated and + will be removed in 1.5. :pr:`25299` by :user:`Omar Salman `. + :mod:`sklearn.gaussian_process` ............................... @@ -524,6 +556,18 @@ Changelog :mod:`sklearn.model_selection` .............................. +- |MajorFeature| Added the class :class:`model_selection.ValidationCurveDisplay` + that allows easy plotting of validation curves obtained by the function + :func:`model_selection.validation_curve`. + :pr:`25120` by :user:`Guillaume Lemaitre `. + +- |API| The parameter `log_scale` in the class + :class:`model_selection.LearningCurveDisplay` has been deprecated in 1.3 and + will be removed in 1.5. The default scale can be overriden by setting it + directly on the `ax` object and will be set automatically from the spacing + of the data points otherwise. + :pr:`25120` by :user:`Guillaume Lemaitre `. + - |Enhancement| :func:`model_selection.cross_validate` accepts a new parameter `return_indices` to return the train-test indices of each cv split. :pr:`25659` by :user:`Guillaume Lemaitre `. @@ -546,15 +590,15 @@ Changelog :mod:`sklearn.neighbors` ........................ -- |Fix| Remove support for `KulsinskiDistance` in :class:`neighbors.BallTree`. This - dissimilarity is not a metric and cannot be supported by the BallTree. - :pr:`25417` by :user:`Guillaume Lemaitre `. - - |Enhancement| The performance of :meth:`neighbors.KNeighborsClassifier.predict` and of :meth:`neighbors.KNeighborsClassifier.predict_proba` has been improved when `n_neighbors` is large and `algorithm="brute"` with non Euclidean metrics. :pr:`24076` by :user:`Meekail Zain `, :user:`Julien Jerphanion `. +- |Fix| Remove support for `KulsinskiDistance` in :class:`neighbors.BallTree`. This + dissimilarity is not a metric and cannot be supported by the BallTree. + :pr:`25417` by :user:`Guillaume Lemaitre `. + - |API| The support for metrics other than `euclidean` and `manhattan` and for callables in :class:`neighbors.NearestNeighbors` is deprecated and will be removed in version 1.5. :pr:`24083` by :user:`Valentin Laurent `. @@ -592,10 +636,24 @@ Changelog categorical encoding based on target mean conditioned on the value of the category. :pr:`25334` by `Thomas Fan`_. +- |Feature| :class:`preprocessing.OrdinalEncoder` now supports grouping + infrequent categories into a single feature. Grouping infrequent categories + is enabled by specifying how to select infrequent categories with + `min_frequency` or `max_categories`. :pr:`25677` by `Thomas Fan`_. + +- |Enhancement| :class:`preprocessing.PolynomialFeatures` now calculates the + number of expanded terms a-priori when dealing with sparse `csr` matrices + in order to optimize the choice of `dtype` for `indices` and `indptr`. 
It + can now output `csr` matrices with `np.int32` `indices/indptr` components + when there are few enough elements, and will automatically use `np.int64` + for sufficiently large matrices. + :pr:`20524` by :user:`niuk-a ` and + :pr:`23731` by :user:`Meekail Zain ` + - |Enhancement| A new parameter `sparse_output` was added to - :class:`SplineTransformer`, available as of SciPy 1.8. If `sparse_output=True`, - :class:`SplineTransformer` returns a sparse CSR matrix. - :pr:`24145` by :user:`Christian Lorentzen `. + :class:`preprocessing.SplineTransformer`, available as of SciPy 1.8. If + `sparse_output=True`, :class:`preprocessing.SplineTransformer` returns a sparse + CSR matrix. :pr:`24145` by :user:`Christian Lorentzen `. - |Enhancement| Adds a `feature_name_combiner` parameter to :class:`preprocessing.OneHotEncoder`. This specifies a custom callable to create @@ -610,28 +668,35 @@ Changelog :pr:`24935` by :user:`Seladus `, :user:`Guillaume Lemaitre `, and :user:`Dea María Léon `, :pr:`25257` by :user:`Gleb Levitski `. -- |Feature| :class:`preprocessing.OrdinalEncoder` now supports grouping - infrequent categories into a single feature. Grouping infrequent categories - is enabled by specifying how to select infrequent categories with - `min_frequency` or `max_categories`. :pr:`25677` by `Thomas Fan`_. - - |Enhancement| Subsampling through the `subsample` parameter can now be used in :class:`preprocessing.KBinsDiscretizer` regardless of the strategy used. :pr:`26424` by :user:`Jérémie du Boisberranger `. -- |API| The default value of the `subsample` parameter of - :class:`preprocessing.KBinsDiscretizer` will change from `None` to `200_000` in - version 1.5 when `strategy="kmeans"` or `strategy="uniform"`. - :pr:`26424` by :user:`Jérémie du Boisberranger `. +- |Fix| :class:`preprocessing.AdditiveChi2Sampler` is now stateless. + The `sample_interval_` attribute is deprecated and will be removed in 1.5. + :pr:`25190` by :user:`Vincent Maladière `. - |Fix| :class:`AdditiveChi2Sampler` is now stateless. The `sample_interval_` attribute is deprecated and will be removed in 1.5. :pr:`25190` by :user:`Vincent Maladière `. +- |Fix| :class:`preprocessing.PowerTransformer` now correctly preserves the Pandas + Index when the `set_config(transform_output="pandas")`. :pr:`26454` by `Thomas Fan`_. + - |Fix| :class:`preprocessing.PowerTransformer` now correcly raises error when using `method="box-cox"` on data with a constant `np.nan` column. :pr:`26400` by :user:`Yao Xiao `. +- |Fix| :class:`preprocessing.PowerTransformer` with `method="yeo-johnson"` now leaves + constant features unchanged instead of transforming with an arbitrary value for + the `lambdas_` fitted parameter. + :pr:`26566` by :user:`Jérémie du Boisberranger `. + +- |API| The default value of the `subsample` parameter of + :class:`preprocessing.KBinsDiscretizer` will change from `None` to `200_000` in + version 1.5 when `strategy="kmeans"` or `strategy="uniform"`. + :pr:`26424` by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.svm` .................. @@ -660,45 +725,36 @@ Changelog :mod:`sklearn.utils` .................... -- |API| :func:`estimator_checks.check_transformers_unfitted_stateless` has been +- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas + extension arrays. :pr:`25813` and :pr:`26106` by `Thomas Fan`_. + +- |Fix| :func:`utils.validation.check_array` now supports pandas DataFrames with + extension arrays and object dtypes by return an ndarray with object dtype. + :pr:`25814` by `Thomas Fan`_. 
+ +- |API| :func:`utils.estimator_checks.check_transformers_unfitted_stateless` has been introduced to ensure stateless transformers don't raise `NotFittedError` during `transform` with no prior call to `fit` or `fit_transform`. :pr:`25190` by :user:`Vincent Maladière `. -- |Enhancement| :class:`preprocessing.PolynomialFeatures` now calculates the - number of expanded terms a-priori when dealing with sparse `csr` matrices - in order to optimize the choice of `dtype` for `indices` and `indptr`. It - can now output `csr` matrices with `np.int32` `indices/indptr` components - when there are few enough elements, and will automatically use `np.int64` - for sufficiently large matrices. - :pr:`20524` by :user:`niuk-a ` and - :pr:`23731` by :user:`Meekail Zain ` - - |API| A `FutureWarning` is now raised when instantiating a class which inherits from a deprecated base class (i.e. decorated by :class:`utils.deprecated`) and which overrides the `__init__` method. :pr:`25733` by :user:`Brigitta Sipőcz ` and :user:`Jérémie du Boisberranger `. -- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas - extension arrays. :pr:`25813` and :pr:`26106` by `Thomas Fan`_. - -- |Fix| :func:`utils.validation.check_array` now supports pandas DataFrames with - extension arrays and object dtypes by return an ndarray with object dtype. - :pr:`25814` by `Thomas Fan`_. - :mod:`sklearn.semi_supervised` .............................. -- |Enhancement| :meth:`LabelSpreading.fit` and :meth:`LabelPropagation.fit` now - accepts sparse metrics. +- |Enhancement| :meth:`semi_supervised.LabelSpreading.fit` and + :meth:`semi_supervised.LabelPropagation.fit` now accepts sparse metrics. :pr:`19664` by :user:`Kaushik Amar Das `. Miscellaneous ............. -- |Enhancement| Replace obsolete exceptions EnvironmentError, IOError and - WindowsError. +- |Enhancement| Replace obsolete exceptions `EnvironmentError`, `IOError` and + `WindowsError`. :pr:`26466` by :user:`Dimitri Papadopoulos ORfanos `. 
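Since the changelog entries above only name the new display-based plotting API, a minimal usage sketch follows (assuming a scikit-learn build that already ships `ValidationCurveDisplay`, i.e. 1.3 or this branch):

```python
# Minimal use of the displays referenced in the changelog entries above.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import LearningCurveDisplay, ValidationCurveDisplay
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)

# Wraps validation_curve() and plots train/test scores against the parameter.
ValidationCurveDisplay.from_estimator(
    SVC(), X, y, param_name="gamma", param_range=np.logspace(-6, -1, 5)
)

# LearningCurveDisplay now shows both curves by default; score_type="test"
# keeps the previous behaviour.
LearningCurveDisplay.from_estimator(SVC(), X, y, score_type="test")
plt.show()
```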
Code and Documentation Contributors diff --git a/examples/miscellaneous/plot_kernel_ridge_regression.py b/examples/miscellaneous/plot_kernel_ridge_regression.py index 20b8496ab18aa..fa7cb15446473 100644 --- a/examples/miscellaneous/plot_kernel_ridge_regression.py +++ b/examples/miscellaneous/plot_kernel_ridge_regression.py @@ -203,6 +203,7 @@ "scoring": "neg_mean_squared_error", "negate_score": True, "score_name": "Mean Squared Error", + "score_type": "test", "std_display_style": None, "ax": ax, } diff --git a/examples/model_selection/plot_validation_curve.py b/examples/model_selection/plot_validation_curve.py index 1b3c562594188..48aa19dfbc556 100644 --- a/examples/model_selection/plot_validation_curve.py +++ b/examples/model_selection/plot_validation_curve.py @@ -18,53 +18,23 @@ from sklearn.datasets import load_digits from sklearn.svm import SVC -from sklearn.model_selection import validation_curve +from sklearn.model_selection import ValidationCurveDisplay X, y = load_digits(return_X_y=True) subset_mask = np.isin(y, [1, 2]) # binary classification: 1 vs 2 X, y = X[subset_mask], y[subset_mask] -param_range = np.logspace(-6, -1, 5) -train_scores, test_scores = validation_curve( +disp = ValidationCurveDisplay.from_estimator( SVC(), X, y, param_name="gamma", - param_range=param_range, - scoring="accuracy", + param_range=np.logspace(-6, -1, 5), + score_type="both", n_jobs=2, + score_name="Accuracy", ) -train_scores_mean = np.mean(train_scores, axis=1) -train_scores_std = np.std(train_scores, axis=1) -test_scores_mean = np.mean(test_scores, axis=1) -test_scores_std = np.std(test_scores, axis=1) - -plt.title("Validation Curve with SVM") -plt.xlabel(r"$\gamma$") -plt.ylabel("Score") -plt.ylim(0.0, 1.1) -lw = 2 -plt.semilogx( - param_range, train_scores_mean, label="Training score", color="darkorange", lw=lw -) -plt.fill_between( - param_range, - train_scores_mean - train_scores_std, - train_scores_mean + train_scores_std, - alpha=0.2, - color="darkorange", - lw=lw, -) -plt.semilogx( - param_range, test_scores_mean, label="Cross-validation score", color="navy", lw=lw -) -plt.fill_between( - param_range, - test_scores_mean - test_scores_std, - test_scores_mean + test_scores_std, - alpha=0.2, - color="navy", - lw=lw, -) -plt.legend(loc="best") +disp.ax_.set_title("Validation Curve for SVM with an RBF kernel") +disp.ax_.set_xlabel(r"gamma (inverse radius of the RBF kernel)") +disp.ax_.set_ylim(0.0, 1.1) plt.show() diff --git a/sklearn/base.py b/sklearn/base.py index 5cced34d4b8f0..13bbcab96aa61 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -27,7 +27,7 @@ from .utils.validation import _num_features from .utils.validation import _check_feature_names_in from .utils.validation import _generate_get_feature_names_out -from .utils.validation import check_is_fitted +from .utils.validation import _is_fitted, check_is_fitted from .utils._metadata_requests import _MetadataRequester from .utils.validation import _get_feature_names from .utils._estimator_html_repr import estimator_html_repr @@ -1131,7 +1131,13 @@ def decorator(fit_method): @functools.wraps(fit_method) def wrapper(estimator, *args, **kwargs): global_skip_validation = get_config()["skip_parameter_validation"] - if not global_skip_validation: + + # we don't want to validate again for each call to partial_fit + partial_fit_and_fitted = ( + fit_method.__name__ == "partial_fit" and _is_fitted(estimator) + ) + + if not global_skip_validation and not partial_fit_and_fitted: estimator._validate_params() with config_context( diff --git 
a/sklearn/calibration.py b/sklearn/calibration.py index 5e7bfe2ab4a31..e4869387f4166 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -25,6 +25,7 @@ RegressorMixin, clone, MetaEstimatorMixin, + _fit_context, ) from .preprocessing import label_binarize, LabelEncoder from .utils import ( @@ -318,6 +319,10 @@ def _get_estimator(self): return estimator + @_fit_context( + # CalibratedClassifierCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None, **fit_params): """Fit the calibrated model. @@ -341,8 +346,6 @@ def fit(self, X, y, sample_weight=None, **fit_params): self : object Returns an instance of self. """ - self._validate_params() - check_classification_targets(y) X, y = indexable(X, y) if sample_weight is not None: diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 8a3c2c2acde62..1ffc5f07e8c50 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -12,6 +12,7 @@ from ..exceptions import ConvergenceWarning from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._param_validation import Interval, StrOptions, validate_params from ..utils.validation import check_is_fitted @@ -469,6 +470,7 @@ def __init__( def _more_tags(self): return {"pairwise": self.affinity == "precomputed"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the clustering from features, or affinity matrix. @@ -488,8 +490,6 @@ def fit(self, X, y=None): self Returns the instance itself. """ - self._validate_params() - if self.affinity == "precomputed": accept_sparse = False else: diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 059056275ef3d..b7d08a45dcd80 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -16,6 +16,7 @@ from scipy.sparse.csgraph import connected_components from ..base import BaseEstimator, ClusterMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..metrics.pairwise import paired_distances from ..metrics.pairwise import _VALID_METRICS from ..metrics import DistanceMetric @@ -950,6 +951,7 @@ def __init__( self.metric = metric self.compute_distances = compute_distances + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the hierarchical clustering from features, or distance matrix. @@ -968,7 +970,6 @@ def fit(self, X, y=None): self : object Returns the fitted instance. """ - self._validate_params() X = self._validate_data(X, ensure_min_samples=2) return self._fit(X) @@ -1324,6 +1325,7 @@ def __init__( ) self.pooling_func = pooling_func + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the hierarchical clustering on the data. @@ -1340,7 +1342,6 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - self._validate_params() X = self._validate_data(X, ensure_min_features=2) super()._fit(X.T) self._n_features_out = self.n_clusters_ diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index ba837bacc99d5..4133264626ebb 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -13,6 +13,7 @@ from . 
import KMeans, MiniBatchKMeans from ..base import BaseEstimator, BiclusterMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils import check_scalar @@ -118,6 +119,7 @@ def __init__( def _check_parameters(self, n_samples): """Validate parameters depending on the input data.""" + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Create a biclustering for X. @@ -134,8 +136,6 @@ def fit(self, X, y=None): self : object SpectralBiclustering instance. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csr", dtype=np.float64) self._check_parameters(X.shape[0]) self._fit(X) diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 4c9d7921fdc70..e74630572a014 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -16,6 +16,7 @@ ClusterMixin, BaseEstimator, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..utils.extmath import row_norms from ..utils._param_validation import Interval @@ -501,6 +502,7 @@ def __init__( self.compute_labels = compute_labels self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Build a CF Tree for the input data. @@ -518,9 +520,6 @@ def fit(self, X, y=None): self Fitted estimator. """ - - self._validate_params() - return self._fit(X, partial=False) def _fit(self, X, partial): @@ -610,6 +609,7 @@ def _get_leaves(self): leaf_ptr = leaf_ptr.next_leaf_ return leaves + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X=None, y=None): """ Online learning. Prevents rebuilding of CFTree from scratch. @@ -629,8 +629,6 @@ def partial_fit(self, X=None, y=None): self Fitted estimator. """ - self._validate_params() - if X is None: # Perform just the final global clustering step. self._global_clustering() diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py index fc2b38cc1bca9..959d78ae85009 100644 --- a/sklearn/cluster/_bisect_k_means.py +++ b/sklearn/cluster/_bisect_k_means.py @@ -6,6 +6,7 @@ import numpy as np import scipy.sparse as sp +from ..base import _fit_context from ._kmeans import _BaseKMeans from ._kmeans import _kmeans_single_elkan from ._kmeans import _kmeans_single_lloyd @@ -347,6 +348,7 @@ def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect): cluster_to_bisect.split(best_labels, best_centers, scores) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute bisecting k-means clustering. @@ -373,8 +375,6 @@ def fit(self, X, y=None, sample_weight=None): self Fitted estimator. """ - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index aa81ef27702e6..3c753935ac046 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -16,6 +16,7 @@ from ..metrics.pairwise import _VALID_METRICS from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..utils.validation import _check_sample_weight from ..utils._param_validation import Interval, StrOptions from ..neighbors import NearestNeighbors @@ -338,6 +339,10 @@ def __init__( self.p = p self.n_jobs = n_jobs + @_fit_context( + # DBSCAN.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, sample_weight=None): """Perform DBSCAN clustering from features, or distance matrix. 
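# ---------------------------------------------------------------------------
# Editorial sketch (not part of the patch): the ``@_fit_context(...)`` decorator
# applied throughout this diff replaces the explicit ``self._validate_params()``
# calls that the removed lines contained. The stand-in below is simplified and
# assumes only the public ``sklearn.config_context`` API used in the
# ``sklearn/base.py`` changes shown earlier in this diff; the real decorator
# additionally skips re-validation for ``partial_fit`` on a fitted estimator.
import functools

from sklearn import config_context


def fit_context_sketch(prefer_skip_nested_validation):
    def decorator(fit_method):
        @functools.wraps(fit_method)
        def wrapper(estimator, *args, **kwargs):
            # validate the constructor parameters once, just before fitting
            estimator._validate_params()
            # while this fit runs, nested estimators may skip their own
            # parameter validation when the caller has already validated them
            with config_context(
                skip_parameter_validation=prefer_skip_nested_validation
            ):
                return fit_method(estimator, *args, **kwargs)

        return wrapper

    return decorator
# ---------------------------------------------------------------------------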
@@ -363,8 +368,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Returns a fitted instance of self. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csr") if sample_weight is not None: diff --git a/sklearn/cluster/_feature_agglomeration.py b/sklearn/cluster/_feature_agglomeration.py index 457a83dd41e71..55baf247a2931 100644 --- a/sklearn/cluster/_feature_agglomeration.py +++ b/sklearn/cluster/_feature_agglomeration.py @@ -5,10 +5,12 @@ # Author: V. Michel, A. Gramfort # License: BSD 3 clause +import warnings import numpy as np from ..base import TransformerMixin from ..utils.validation import check_is_fitted +from ..utils import metadata_routing from scipy.sparse import issparse ############################################################################### @@ -20,6 +22,11 @@ class AgglomerationTransform(TransformerMixin): A class for feature agglomeration via the transform interface. """ + # This prevents ``set_split_inverse_transform`` to be generated for the + # non-standard ``Xred`` arg on ``inverse_transform``. + # TODO(1.5): remove when Xred is removed for inverse_transform. + __metadata_request__inverse_transform = {"Xred": metadata_routing.UNUSED} + def transform(self, X): """ Transform a new matrix using the built clustering. @@ -54,22 +61,43 @@ def transform(self, X): nX = np.array(nX).T return nX - def inverse_transform(self, Xred): + def inverse_transform(self, Xt=None, Xred=None): """ Inverse the transformation and return a vector of size `n_features`. Parameters ---------- - Xred : array-like of shape (n_samples, n_clusters) or (n_clusters,) + Xt : array-like of shape (n_samples, n_clusters) or (n_clusters,) The values to be assigned to each cluster of samples. + Xred : deprecated + Use `Xt` instead. + + .. deprecated:: 1.3 + Returns ------- X : ndarray of shape (n_samples, n_features) or (n_features,) A vector of size `n_samples` with the values of `Xred` assigned to each of the cluster of samples. """ + if Xt is None and Xred is None: + raise TypeError("Missing required positional argument: Xt") + + if Xred is not None and Xt is not None: + raise ValueError("Please provide only `Xt`, and not `Xred`.") + + if Xred is not None: + warnings.warn( + ( + "Input argument `Xred` was renamed to `Xt` in v1.3 and will be" + " removed in v1.5." + ), + FutureWarning, + ) + Xt = Xred + check_is_fitted(self) unil, inverse = np.unique(self.labels_, return_inverse=True) - return Xred[..., inverse] + return Xt[..., inverse] diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 971d5735fbe2b..b36999885a14e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -23,6 +23,7 @@ ClusterMixin, TransformerMixin, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..metrics.pairwise import euclidean_distances from ..metrics.pairwise import _euclidean_distances @@ -1448,6 +1449,7 @@ def _warn_mkl_vcomp(self, n_active_threads): f" variable OMP_NUM_THREADS={n_active_threads}." ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute k-means clustering. @@ -1475,8 +1477,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", @@ -2057,6 +2057,7 @@ def _random_reassign(self): return True return False + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. 
@@ -2084,8 +2085,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", @@ -2214,6 +2213,7 @@ def fit(self, X, y=None, sample_weight=None): return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, sample_weight=None): """Update k means estimate on a single mini-batch X. @@ -2241,9 +2241,6 @@ def partial_fit(self, X, y=None, sample_weight=None): """ has_centers = hasattr(self, "cluster_centers_") - if not has_centers: - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 46a00ed3f0740..6b0f227d011f9 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -24,6 +24,7 @@ from ..utils.parallel import delayed, Parallel from ..utils import check_random_state, gen_batches, check_array from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..neighbors import NearestNeighbors from ..metrics.pairwise import pairwise_distances_argmin from .._config import config_context @@ -435,6 +436,7 @@ def __init__( self.n_jobs = n_jobs self.max_iter = max_iter + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Perform clustering. @@ -451,7 +453,6 @@ def fit(self, X, y=None): self : object Fitted instance. """ - self._validate_params() X = self._validate_data(X) bandwidth = self.bandwidth if bandwidth is None: diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 0f1c66ada2d4e..ca1c74d6f44e7 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -24,6 +24,7 @@ from ..utils.validation import check_memory from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..metrics import pairwise_distances from scipy.sparse import issparse, SparseEfficiencyWarning @@ -288,6 +289,10 @@ def __init__( self.memory = memory self.n_jobs = n_jobs + @_fit_context( + # Optics.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Perform OPTICS clustering. @@ -311,8 +316,6 @@ def fit(self, X, y=None): self : object Returns a fitted instance of self. """ - self._validate_params() - dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float if dtype == bool and X.dtype != bool: msg = ( diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index e0ab7da938bfd..f72db4b7c1da3 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -15,6 +15,7 @@ from scipy.sparse import csc_matrix from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..utils._param_validation import Interval, StrOptions, validate_params from ..utils import check_random_state, as_float_array from ..metrics.pairwise import pairwise_kernels, KERNEL_PARAMS @@ -649,6 +650,7 @@ def __init__( self.n_jobs = n_jobs self.verbose = verbose + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Perform spectral clustering from features, or affinity matrix. @@ -671,8 +673,6 @@ def fit(self, X, y=None): self : object A fitted instance of the estimator. 
""" - self._validate_params() - X = self._validate_data( X, accept_sparse=["csr", "csc", "coo"], diff --git a/sklearn/cluster/tests/test_feature_agglomeration.py b/sklearn/cluster/tests/test_feature_agglomeration.py index 3e4aa816b79c0..3db2862384c74 100644 --- a/sklearn/cluster/tests/test_feature_agglomeration.py +++ b/sklearn/cluster/tests/test_feature_agglomeration.py @@ -2,9 +2,11 @@ Tests for sklearn.cluster._feature_agglomeration """ # Authors: Sergul Aydore 2017 +import warnings import numpy as np from numpy.testing import assert_array_equal +import pytest from sklearn.cluster import FeatureAgglomeration from sklearn.utils._testing import assert_array_almost_equal from sklearn.datasets import make_blobs @@ -53,3 +55,25 @@ def test_feature_agglomeration_feature_names_out(): assert_array_equal( [f"featureagglomeration{i}" for i in range(n_clusters)], names_out ) + + +# TODO(1.5): remove this test +def test_inverse_transform_Xred_deprecation(): + X = np.array([0, 0, 1]).reshape(1, 3) # (n_samples, n_features) + + est = FeatureAgglomeration(n_clusters=1, pooling_func=np.mean) + est.fit(X) + Xt = est.transform(X) + + with pytest.raises(TypeError, match="Missing required positional argument"): + est.inverse_transform() + + with pytest.raises(ValueError, match="Please provide only"): + est.inverse_transform(Xt=Xt, Xred=Xt) + + with warnings.catch_warnings(record=True): + warnings.simplefilter("error") + est.inverse_transform(Xt) + + with pytest.warns(FutureWarning, match="Input argument `Xred` was renamed to `Xt`"): + est.inverse_transform(Xred=Xt) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index aab021c0c8d4f..14349662cfee9 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -14,6 +14,7 @@ from scipy import sparse from ..base import clone, TransformerMixin +from ..base import _fit_context from ..utils._estimator_html_repr import _VisualBlock from ..pipeline import _fit_transform_one, _transform_one, _name_estimators from ..preprocessing import FunctionTransformer @@ -701,12 +702,15 @@ def fit(self, X, y=None): self : ColumnTransformer This estimator. """ - self._validate_params() # we use fit_transform to make sure to set sparse_output_ (for which we # need the transformed data) to have consistent output type in predict self.fit_transform(X, y=y) return self + @_fit_context( + # estimators in ColumnTransformer.transformers are not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit all transformers, transform the data and concatenate results. @@ -728,7 +732,6 @@ def fit_transform(self, X, y=None): any result is a sparse matrix, everything will be converted to sparse matrices. """ - self._validate_params() self._check_feature_names(X, reset=True) X = _check_X(X) diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index f31a5a49b641e..e926ed7abe324 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -7,6 +7,7 @@ import numpy as np from ..base import BaseEstimator, RegressorMixin, clone +from ..base import _fit_context from ..utils.validation import check_is_fitted from ..utils._tags import _safe_tags from ..utils import check_array, _safe_indexing @@ -197,6 +198,10 @@ def _fit_transformer(self, y): UserWarning, ) + @_fit_context( + # TransformedTargetRegressor.regressor/transformer are not validated yet. 
+ prefer_skip_nested_validation=False + ) def fit(self, X, y, **fit_params): """Fit the model according to the given training data. @@ -218,7 +223,6 @@ def fit(self, X, y, **fit_params): self : object Fitted estimator. """ - self._validate_params() if y is None: raise ValueError( f"This {self.__class__.__name__} estimator " diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index 1ef0eedd62f64..c99f200592580 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -9,6 +9,7 @@ from ..utils.validation import check_is_fitted from ..metrics import accuracy_score from ..base import OutlierMixin +from ..base import _fit_context class EllipticEnvelope(OutlierMixin, MinCovDet): @@ -162,6 +163,7 @@ def __init__( ) self.contamination = contamination + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the EllipticEnvelope model. @@ -178,7 +180,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - # `_validate_params` is called in `MinCovDet` super().fit(X) self.offset_ = np.percentile(-self.dist_, 100.0 * self.contamination) return self diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 7fc23f36d92d3..8083bfd2e1aa1 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -16,6 +16,7 @@ from .. import config_context from ..base import BaseEstimator +from ..base import _fit_context from ..utils import check_array from ..utils._param_validation import validate_params from ..utils.extmath import fast_logdet @@ -218,6 +219,7 @@ def get_precision(self): precision = linalg.pinvh(self.covariance_, check_finite=False) return precision + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the maximum likelihood covariance estimator to X. @@ -235,7 +237,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index afe21fa3a02f1..8575cc4f75801 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -16,6 +16,7 @@ from . import empirical_covariance, EmpiricalCovariance, log_likelihood +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils.validation import ( _is_arraylike_not_scalar, @@ -532,6 +533,7 @@ def __init__( self.alpha = alpha self.covariance = covariance + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the GraphicalLasso model to X. @@ -548,7 +550,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() # Covariance does not make sense for a single feature X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2) @@ -925,6 +926,7 @@ def __init__( self.cv = cv self.n_jobs = n_jobs + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the GraphicalLasso covariance model to X. @@ -941,7 +943,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() # Covariance does not make sense for a single feature X = self._validate_data(X, ensure_min_features=2) if self.assume_centered: diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index f3dd6d60badf8..c723bba7a097b 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -15,6 +15,7 @@ from scipy.stats import chi2 from . import empirical_covariance, EmpiricalCovariance +from ..base import _fit_context from ..utils.extmath import fast_logdet from ..utils import check_random_state, check_array from ..utils._param_validation import Interval @@ -719,6 +720,7 @@ def __init__( self.support_fraction = support_fraction self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit a Minimum Covariance Determinant with the FastMCD algorithm. @@ -736,7 +738,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X, ensure_min_samples=2, estimator="MinCovDet") random_state = check_random_state(self.random_state) n_samples, n_features = X.shape diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 4bf3d9a490b6b..21d2e034b45d7 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -18,6 +18,7 @@ import numpy as np from . import empirical_covariance, EmpiricalCovariance +from ..base import _fit_context from ..utils import check_array from ..utils._param_validation import Interval, validate_params @@ -237,6 +238,7 @@ def __init__(self, *, store_precision=True, assume_centered=False, shrinkage=0.1 ) self.shrinkage = shrinkage + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the shrunk covariance model to X. @@ -254,7 +256,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X) # Not calling the parent object to fit, to avoid a potential # matrix inversion when setting the precision @@ -533,6 +534,7 @@ def __init__(self, *, store_precision=True, assume_centered=False, block_size=10 ) self.block_size = block_size + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the Ledoit-Wolf shrunk covariance model to X. @@ -549,7 +551,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) X = self._validate_data(X) @@ -722,6 +723,7 @@ class OAS(EmpiricalCovariance): 0.0195... """ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the Oracle Approximating Shrinkage covariance model to X. @@ -738,8 +740,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() - X = self._validate_data(X) # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index a5e5a1ceff09a..da395d8f060fb 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -16,6 +16,7 @@ from ..base import BaseEstimator, RegressorMixin, TransformerMixin from ..base import MultiOutputMixin from ..base import ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_array, check_consistent_length from ..utils.fixes import sp_version from ..utils.fixes import parse_version @@ -208,6 +209,7 @@ def __init__( self.tol = tol self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, Y): """Fit model to data. @@ -226,8 +228,6 @@ def fit(self, X, Y): self : object Fitted model. """ - self._validate_params() - check_consistent_length(X, Y) X = self._validate_data( X, dtype=np.float64, copy=self.copy, ensure_min_samples=2 @@ -958,6 +958,7 @@ def __init__(self, n_components=2, *, scale=True, copy=True): self.scale = scale self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, Y): """Fit model to data. @@ -974,8 +975,6 @@ def fit(self, X, Y): self : object Fitted estimator. """ - self._validate_params() - check_consistent_length(X, Y) X = self._validate_data( X, dtype=np.float64, copy=self.copy, ensure_min_samples=2 diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py index 7b2faa4b67f4d..bba06fbb74021 100644 --- a/sklearn/datasets/_arff_parser.py +++ b/sklearn/datasets/_arff_parser.py @@ -204,7 +204,10 @@ def _io_to_generator(gzip_file): if len(dfs) >= 2: dfs[0] = dfs[0].astype(dfs[1].dtypes) - frame = pd.concat(dfs, ignore_index=True) + # liac-arff parser does not depend on NumPy and uses None to represent + # missing values. To be consistent with the pandas parser, we replace + # None with np.nan. + frame = pd.concat(dfs, ignore_index=True).fillna(value=np.nan) del dfs, first_df # cast the columns frame diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 42f64fba2037b..c13b82dd769d3 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -920,9 +920,7 @@ def datasets_missing_values(): (1119, "liac-arff", 9, 6, 0), (1119, "pandas", 9, 0, 6), # miceprotein - # 1 column has only missing values with object dtype - (40966, "liac-arff", 1, 76, 0), - # with casting it will be transformed to either float or Int64 + (40966, "liac-arff", 1, 77, 0), (40966, "pandas", 1, 77, 0), # titanic (40945, "liac-arff", 3, 6, 0), diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index ab2f87de4bb84..54b3590f5b62e 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1796,6 +1796,7 @@ def fit(self, X, y=None): self.fit_transform(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit the model from data in X and return the transformed data. @@ -1813,8 +1814,6 @@ def fit_transform(self, X, y=None): V : ndarray of shape (n_samples, n_components) Transformed data. 
""" - self._validate_params() - _check_positive_coding(method=self.fit_algorithm, positive=self.positive_code) method = "lasso_" + self.fit_algorithm @@ -2435,6 +2434,7 @@ def fit(self, X, y=None): return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Update the model using the data in X as a mini-batch. @@ -2454,9 +2454,6 @@ def partial_fit(self, X, y=None): """ has_components = hasattr(self, "components_") - if not has_components: - self._validate_params() - X = self._validate_data( X, dtype=[np.float64, np.float32], order="C", reset=not has_components ) diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index a6507d167b9cb..8c3d590b2c814 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -27,6 +27,7 @@ from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import fast_logdet, randomized_svd, squared_norm @@ -197,6 +198,7 @@ def __init__( self.random_state = random_state self.rotation = rotation + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the FactorAnalysis model to X using SVD based approach. @@ -213,8 +215,6 @@ def fit(self, X, y=None): self : object FactorAnalysis class instance. """ - self._validate_params() - X = self._validate_data(X, copy=self.copy, dtype=np.float64) n_samples, n_features = X.shape diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 680a6cd8bbee1..6dcf62c0ace3b 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -16,6 +16,7 @@ from scipy import linalg from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils import check_array, as_float_array, check_random_state from ..utils.validation import check_is_fitted @@ -672,6 +673,7 @@ def g(x, fun_args): return S + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit the model and recover the sources from X. @@ -690,10 +692,9 @@ def fit_transform(self, X, y=None): Estimated sources obtained by transforming the data with the estimated unmixing matrix. """ - self._validate_params() - return self._fit_transform(X, compute_sources=True) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model to X. @@ -711,8 +712,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() - self._fit_transform(X, compute_sources=False) return self diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index d98a5f4fb3b7a..5ae5d58b06ca4 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -9,6 +9,7 @@ from scipy import linalg, sparse from ._base import _BasePCA +from ..base import _fit_context from ..utils import gen_batches from ..utils._param_validation import Interval from ..utils.extmath import svd_flip, _incremental_mean_and_var @@ -192,6 +193,7 @@ def __init__(self, n_components=None, *, whiten=False, copy=True, batch_size=Non self.copy = copy self.batch_size = batch_size + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X, using minibatches of size batch_size. @@ -209,8 +211,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - self.components_ = None self.n_samples_seen_ = 0 self.mean_ = 0.0 @@ -243,6 +243,7 @@ def fit(self, X, y=None): return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, check_input=True): """Incremental fit with X. All of X is processed as a single batch. @@ -265,9 +266,6 @@ def partial_fit(self, X, y=None, check_input=True): """ first_pass = not hasattr(self, "components_") - if first_pass: - self._validate_params() - if check_input: if sparse.issparse(X): raise TypeError( diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index fadcd6f94a2f8..61d502a006c5e 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -19,6 +19,7 @@ from ..utils._param_validation import Interval, StrOptions from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels @@ -404,6 +405,7 @@ def _fit_inverse_transform(self, X_transformed, X): self.dual_coef_ = linalg.solve(K, X, assume_a="pos", overwrite_a=True) self.X_transformed_fit_ = X_transformed + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model from data in X. @@ -421,8 +423,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - if self.fit_inverse_transform and self.kernel == "precomputed": raise ValueError("Cannot fit_inverse_transform with a precomputed kernel.") X = self._validate_data(X, accept_sparse="csr", copy=self.copy_X) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 21829d4fedab3..ab1ea5ebb5460 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -18,6 +18,7 @@ from joblib import effective_n_jobs from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_random_state, gen_batches, gen_even_slices from ..utils.validation import check_non_negative from ..utils.validation import check_is_fitted @@ -568,6 +569,7 @@ def _check_non_neg_array(self, X, reset_n_features, whom): return X + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Online VB with Mini-Batch update. 
@@ -586,9 +588,6 @@ def partial_fit(self, X, y=None): """ first_time = not hasattr(self, "components_") - if first_time: - self._validate_params() - X = self._check_non_neg_array( X, reset_n_features=first_time, whom="LatentDirichletAllocation.partial_fit" ) @@ -618,6 +617,7 @@ def partial_fit(self, X, y=None): return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn model for the data X with variational Bayes method. @@ -637,7 +637,6 @@ def fit(self, X, y=None): self Fitted estimator. """ - self._validate_params() X = self._check_non_neg_array( X, reset_n_features=True, whom="LatentDirichletAllocation.fit" ) diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 67dd0c2ab7b70..d561583dec205 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -19,6 +19,7 @@ from ._cdnmf_fast import _update_cdnmf_fast from .._config import config_context from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils import check_random_state, check_array, gen_batches from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm @@ -31,6 +32,7 @@ StrOptions, validate_params, ) +from ..utils import metadata_routing EPSILON = np.finfo(np.float32).eps @@ -1122,6 +1124,11 @@ def non_negative_factorization( class _BaseNMF(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator, ABC): """Base class for NMF and MiniBatchNMF.""" + # This prevents ``set_split_inverse_transform`` to be generated for the + # non-standard ``W`` arg on ``inverse_transform``. + # TODO: remove when W is removed in v1.5 for inverse_transform + __metadata_request__inverse_transform = {"W": metadata_routing.UNUSED} + _parameter_constraints: dict = { "n_components": [Interval(Integral, 1, None, closed="left"), None], "init": [ @@ -1245,23 +1252,44 @@ def fit(self, X, y=None, **params): self.fit_transform(X, **params) return self - def inverse_transform(self, W): + def inverse_transform(self, Xt=None, W=None): """Transform data back to its original space. .. versionadded:: 0.18 Parameters ---------- - W : {ndarray, sparse matrix} of shape (n_samples, n_components) + Xt : {ndarray, sparse matrix} of shape (n_samples, n_components) Transformed data matrix. + W : deprecated + Use `Xt` instead. + + .. deprecated:: 1.3 + Returns ------- X : {ndarray, sparse matrix} of shape (n_samples, n_features) Returns a data matrix of the original shape. """ + if Xt is None and W is None: + raise TypeError("Missing required positional argument: Xt") + + if W is not None and Xt is not None: + raise ValueError("Please provide only `Xt`, and not `W`.") + + if W is not None: + warnings.warn( + ( + "Input argument `W` was renamed to `Xt` in v1.3 and will be removed" + " in v1.5." + ), + FutureWarning, + ) + Xt = W + check_is_fitted(self) - return W @ self.components_ + return Xt @ self.components_ @property def _n_features_out(self): @@ -1539,6 +1567,7 @@ def _check_params(self, X): return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None, W=None, H=None): """Learn a NMF model for the data X and returns the transformed data. @@ -1566,8 +1595,6 @@ def fit_transform(self, X, y=None, W=None, H=None): W : ndarray of shape (n_samples, n_components) Transformed data. 
""" - self._validate_params() - X = self._validate_data( X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32] ) @@ -2123,6 +2150,7 @@ def _minibatch_convergence( return False + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None, W=None, H=None): """Learn a NMF model for the data X and returns the transformed data. @@ -2149,8 +2177,6 @@ def fit_transform(self, X, y=None, W=None, H=None): W : ndarray of shape (n_samples, n_components) Transformed data. """ - self._validate_params() - X = self._validate_data( X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32] ) @@ -2288,6 +2314,7 @@ def transform(self, X): return W + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, W=None, H=None): """Update the model using the data in `X` as a mini-batch. @@ -2321,9 +2348,6 @@ def partial_fit(self, X, y=None, W=None, H=None): """ has_components = hasattr(self, "components_") - if not has_components: - self._validate_params() - X = self._validate_data( X, accept_sparse=("csr", "csc"), diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index e8c302fc47129..1d3c0678aca89 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -20,6 +20,7 @@ from scipy.sparse.linalg import svds from ._base import _BasePCA +from ..base import _fit_context from ..utils import check_random_state from ..utils._arpack import _init_arpack_v0 from ..utils.deprecation import deprecated @@ -414,6 +415,7 @@ def __init__( def n_features_(self): return self.n_features_in_ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -431,11 +433,10 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - self._fit(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit the model with X and apply the dimensionality reduction on X. @@ -458,8 +459,6 @@ def fit_transform(self, X, y=None): This method returns a Fortran-ordered array. To convert it to a C-ordered array, use 'np.ascontiguousarray'. """ - self._validate_params() - U, S, Vt = self._fit(X) U = U[:, : self.n_components_] diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 5974b86381e1a..93e4a2164a87f 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -12,6 +12,7 @@ from ..utils.validation import check_array, check_is_fitted from ..linear_model import ridge_regression from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ._dict_learning import dict_learning, MiniBatchDictionaryLearning @@ -53,6 +54,7 @@ def __init__( self.verbose = verbose self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model from data in X. @@ -70,7 +72,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() random_state = check_random_state(self.random_state) X = self._validate_data(X) diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 999266a4f3f78..67f5c73028f15 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -12,6 +12,7 @@ from scipy.sparse.linalg import svds from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_array, check_random_state from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip @@ -200,10 +201,10 @@ def fit(self, X, y=None): self : object Returns the transformer object. """ - # param validation is done in fit_transform self.fit_transform(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit model to X and perform dimensionality reduction on X. @@ -220,7 +221,6 @@ def fit_transform(self, X, y=None): X_new : ndarray of shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ - self._validate_params() X = self._validate_data(X, accept_sparse=["csr", "csc"], ensure_min_features=2) random_state = check_random_state(self.random_state) diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index 74218b83c6952..2b1ed4d91be5e 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -1,6 +1,7 @@ import re import sys from io import StringIO +import warnings import numpy as np import scipy.sparse as sp @@ -906,3 +907,29 @@ def test_minibatch_nmf_verbose(): nmf.fit(A) finally: sys.stdout = old_stdout + + +# TODO(1.5): remove this test +def test_NMF_inverse_transform_W_deprecation(): + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(6, 5)) + est = NMF( + n_components=3, + init="random", + random_state=0, + tol=1e-6, + ) + Xt = est.fit_transform(A) + + with pytest.raises(TypeError, match="Missing required positional argument"): + est.inverse_transform() + + with pytest.raises(ValueError, match="Please provide only"): + est.inverse_transform(Xt=Xt, W=Xt) + + with warnings.catch_warnings(record=True): + warnings.simplefilter("error") + est.inverse_transform(Xt) + + with pytest.warns(FutureWarning, match="Input argument `W` was renamed to `Xt`"): + est.inverse_transform(W=Xt) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index c8c0a656e5784..275f4ae4d3b30 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -17,6 +17,7 @@ from .base import BaseEstimator, TransformerMixin, ClassifierMixin from .base import ClassNamePrefixFeaturesOutMixin +from .base import _fit_context from .linear_model._base import LinearClassifierMixin from .covariance import ledoit_wolf, empirical_covariance, shrunk_covariance from .utils.multiclass import unique_labels @@ -546,6 +547,10 @@ def _solve_svd(self, X, y): self.coef_ = coef @ self.scalings_.T self.intercept_ -= self.xbar_ @ self.coef_.T + @_fit_context( + # LinearDiscriminantAnalysis.covariance_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the Linear Discriminant Analysis model. @@ -568,8 +573,6 @@ def fit(self, X, y): self : object Fitted estimator. 
""" - self._validate_params() - xp, _ = get_namespace(X) X, y = self._validate_data( @@ -865,6 +868,7 @@ def __init__( self.store_covariance = store_covariance self.tol = tol + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model according to the given training data and parameters. @@ -889,7 +893,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() X, y = self._validate_data(X, y) check_classification_targets(y) self.classes_, y = np.unique(y, return_inverse=True) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 25f910e8419f4..0d8519484d7a5 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -11,6 +11,7 @@ from .base import BaseEstimator, ClassifierMixin, RegressorMixin from .base import MultiOutputMixin +from .base import _fit_context from .utils import check_random_state from .utils._param_validation import StrOptions, Interval from .utils.validation import _num_samples @@ -142,6 +143,7 @@ def __init__(self, *, strategy="prior", random_state=None, constant=None): self.random_state = random_state self.constant = constant + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the baseline classifier. @@ -161,8 +163,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - self._strategy = self.strategy if self._strategy == "uniform" and sp.issparse(y): @@ -518,6 +518,7 @@ def __init__(self, *, strategy="mean", constant=None, quantile=None): self.constant = constant self.quantile = quantile + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the random regressor. @@ -537,8 +538,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - y = check_array(y, ensure_2d=False, input_name="y") if len(y) == 0: raise ValueError("y must not be empty.") diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index bad6dcfb033ec..0354413fdebfe 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -14,6 +14,7 @@ from ._base import BaseEnsemble, _partition_estimators from ..base import ClassifierMixin, RegressorMixin +from ..base import _fit_context from ..metrics import r2_score, accuracy_score from ..tree import DecisionTreeClassifier, DecisionTreeRegressor from ..utils import check_random_state, column_or_1d @@ -301,6 +302,10 @@ def __init__( self.random_state = random_state self.verbose = verbose + @_fit_context( + # BaseBagging.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Build a Bagging ensemble of estimators from the training set (X, y). @@ -324,9 +329,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. 
""" - - self._validate_params() - # Convert data (X is required to be 2d and indexable) X, y = self._validate_data( X, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 4d9bf862bd806..e715952947c04 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -50,7 +50,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from scipy.sparse import issparse from scipy.sparse import hstack as sparse_hstack -from sklearn.base import is_classifier +from sklearn.base import is_classifier, _fit_context from sklearn.base import ( ClassifierMixin, MultiOutputMixin, @@ -221,6 +221,7 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): None, Interval(Integral, 1, None, closed="left"), ], + "store_leaf_values": [bool], } @abstractmethod @@ -240,6 +241,7 @@ def __init__( max_samples=None, base_estimator="deprecated", max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -257,6 +259,7 @@ def __init__( self.class_weight = class_weight self.max_samples = max_samples self.max_bins = max_bins + self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -333,6 +336,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """ Build a forest of trees from the training set (X, y). @@ -360,8 +364,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - # Validate or convert input data if issparse(y): raise ValueError("sparse multilabel-indicator for y is not supported.") @@ -717,6 +719,139 @@ def _bin_data(self, X, is_training_data): return X_binned + def predict_quantiles(self, X, quantiles=0.5, method="nearest"): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`np.quantile`. + check_input : bool, optional + Whether or not to check input, by default True. + + Returns + ------- + y : ndarray of shape (n_samples, n_quantiles) or + (n_samples, n_quantiles, n_outputs) + The predicted values. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Quantile prediction is not available when store_leaf_values=False" + ) + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros( + (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 + ) + else: + y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) + + # get (n_samples, n_estimators) indicator of leaf nodes + X_leaves = self.apply(X) + + # we now want to aggregate all leaf samples across all trees for each sample + for idx in range(X.shape[0]): + # get leaf nodes for this sample + leaf_nodes = X_leaves[idx, :] + + # (n_total_leaf_samples, n_outputs) + leaf_node_samples = np.vstack( + ( + est.leaf_nodes_samples_[leaf_nodes[jdx]] + for jdx, est in enumerate(self.estimators_) + ) + ) + + # get quantiles across all leaf node samples + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) + + if is_classifier(self): + if self.n_outputs_ == 1: + for i in range(len(quantiles)): + class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) + y_hat[idx, ...] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + else: + for k in range(self.n_outputs_): + for i in range(len(quantiles)): + class_pred_per_sample = ( + y_hat[idx, i, k].squeeze().astype(int) + ) + y_hat[idx, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + return y_hat + + def get_leaf_node_samples(self, X): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_node_samples : a list of array-like of shape + (n_leaf_node_samples, n_outputs) + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. + """ + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) + for e in self.estimators_ + ) + leaf_nodes_samples = result[0] + for result_ in result[1:]: + for i, node_samples in enumerate(result_): + leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) + return leaf_nodes_samples + def _accumulate_prediction(predict, X, out, lock): """ @@ -734,6 +869,17 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] +def _accumulate_leaf_nodes_samples(func, X): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. 
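# ---------------------------------------------------------------------------
# Editorial usage sketch (not part of the patch) for the quantile-prediction
# API added above. It assumes an estimator built from this fork, i.e. a
# RandomForestRegressor exposing the new ``store_leaf_values`` parameter,
# ``predict_quantiles`` and ``get_leaf_node_samples``.
import numpy as np

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=4, random_state=0)
forest = RandomForestRegressor(
    n_estimators=50,
    store_leaf_values=True,  # keep the training samples that fall in each leaf
    random_state=0,
).fit(X, y)

# median and an 80% interval computed from the pooled leaf-node samples
quantile_pred = forest.predict_quantiles(X[:5], quantiles=[0.1, 0.5, 0.9])
print(quantile_pred.shape)  # (5, 3): one row per query point, one column per quantile

# raw training samples that share a leaf with each query point
leaf_samples = forest.get_leaf_node_samples(X[:5])
print(len(leaf_samples), leaf_samples[0].shape[1])  # 5 arrays, n_outputs columns each
# ---------------------------------------------------------------------------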
+ """ + leaf_nodes_samples = func(X, check_input=False) + return leaf_nodes_samples + + class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. @@ -759,6 +905,7 @@ def __init__( max_samples=None, base_estimator="deprecated", max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -774,6 +921,7 @@ def __init__( max_samples=max_samples, base_estimator=base_estimator, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) @staticmethod @@ -1037,6 +1185,7 @@ def __init__( max_samples=None, base_estimator="deprecated", max_bins=None, + store_leaf_values=False, ): super().__init__( estimator, @@ -1051,6 +1200,7 @@ def __init__( max_samples=max_samples, base_estimator=base_estimator, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -1515,6 +1665,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=DecisionTreeClassifier(), @@ -1530,6 +1681,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -1540,6 +1692,7 @@ def __init__( class_weight=class_weight, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1858,6 +2011,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=DecisionTreeRegressor(), @@ -1873,6 +2027,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -1882,6 +2037,7 @@ def __init__( warm_start=warm_start, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2210,6 +2366,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeClassifier(), @@ -2225,6 +2382,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2235,6 +2393,7 @@ def __init__( class_weight=class_weight, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2534,6 +2693,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2549,6 +2709,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2558,6 +2719,7 @@ def __init__( warm_start=warm_start, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2783,6 +2945,7 @@ def __init__( random_state=None, verbose=0, warm_start=False, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2797,6 +2960,7 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", + "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -2805,6 +2969,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, + store_leaf_values=store_leaf_values, ) self.max_depth = max_depth @@ -2848,6 +3013,7 @@ def fit(self, X, y=None, sample_weight=None): self.fit_transform(X, y, sample_weight=sample_weight) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, 
X, y=None, sample_weight=None): """ Fit estimator and transform dataset. @@ -2873,8 +3039,6 @@ def fit_transform(self, X, y=None, sample_weight=None): X_transformed : sparse matrix of shape (n_samples, n_out) Transformed dataset. """ - self._validate_params() - rnd = check_random_state(self.random_state) y = rnd.uniform(size=_num_samples(X)) super().fit(X, y, sample_weight=sample_weight) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index df9904c8a9aa4..8d435873aeb5c 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -28,6 +28,7 @@ from ._base import BaseEnsemble from ..base import ClassifierMixin, RegressorMixin from ..base import is_classifier +from ..base import _fit_context from ._gradient_boosting import predict_stages from ._gradient_boosting import predict_stage @@ -146,6 +147,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): "n_iter_no_change": [Interval(Integral, 1, None, closed="left"), None], "tol": [Interval(Real, 0.0, None, closed="left")], } + _parameter_constraints.pop("store_leaf_values") _parameter_constraints.pop("splitter") @abstractmethod @@ -376,6 +378,10 @@ def _check_initialized(self): """Check that the estimator is initialized, raising an error if not.""" check_is_fitted(self) + @_fit_context( + # GradientBoosting*.init is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None, monitor=None): """Fit the gradient boosting model. @@ -412,8 +418,6 @@ def fit(self, X, y, sample_weight=None, monitor=None): self : object Fitted estimator. """ - self._validate_params() - if not self.warm_start: self._clear_state() diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 976335ea684d0..79b640057abe5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -18,6 +18,7 @@ PinballLoss, ) from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier +from ...base import _fit_context from ...utils import check_random_state, resample, compute_sample_weight from ...utils.validation import ( check_is_fitted, @@ -336,6 +337,7 @@ def _check_interaction_cst(self, n_features): return constraints + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the gradient boosting model. @@ -357,8 +359,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - fit_start_time = time() acc_find_split_time = 0.0 # time spent finding the best splits acc_apply_split_time = 0.0 # time spent splitting nodes diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index bb016fa33185b..048a1d69395e2 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -20,6 +20,7 @@ from ..utils._param_validation import RealNotInt from ..utils.validation import check_is_fitted, _num_samples from ..base import OutlierMixin +from ..base import _fit_context from ._bagging import BaseBagging @@ -265,6 +266,7 @@ def _parallel_args(self): # copies. return {"prefer": "threads"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """ Fit estimator. @@ -287,7 +289,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. 
""" - self._validate_params() X = self._validate_data(X, accept_sparse=["csc"], dtype=tree_dtype) if issparse(X): # Pre-sort indices to avoid that each individual tree of the diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 10f7a606f20c9..5b3486edfeb33 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -13,6 +13,7 @@ from ..base import clone from ..base import ClassifierMixin, RegressorMixin, TransformerMixin from ..base import is_classifier, is_regressor +from ..base import _fit_context from ..exceptions import NotFittedError from ..utils._estimator_html_repr import _VisualBlock @@ -159,6 +160,10 @@ def _method_name(name, estimator, method): return method_name + @_fit_context( + # estimators in Stacking*.estimators are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit the estimators. @@ -184,9 +189,6 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - - self._validate_params() - # all_estimators contains all estimators, the one to be fitted and the # 'drop' string. names, all_estimators = self._validate_estimators() diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 1c250cbe11a06..f8f4d2c4c197f 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -22,6 +22,7 @@ from ..base import RegressorMixin from ..base import TransformerMixin from ..base import clone +from ..base import _fit_context from ._base import _fit_single_estimator from ._base import _BaseHeterogeneousEnsemble from ..preprocessing import LabelEncoder @@ -308,6 +309,10 @@ def __init__( self.flatten_transform = flatten_transform self.verbose = verbose + @_fit_context( + # estimators in VotingClassifier.estimators are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit the estimators. @@ -332,7 +337,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() check_classification_targets(y) if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1: raise NotImplementedError( @@ -572,6 +576,10 @@ def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False): self.n_jobs = n_jobs self.verbose = verbose + @_fit_context( + # estimators in VotingRegressor.estimators are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit the estimators. @@ -594,7 +602,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() y = column_or_1d(y, warn=True) return super().fit(X, y, sample_weight) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index b2aff503b0bb0..569609e6326e5 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -34,7 +34,7 @@ from ._base import BaseEnsemble from ..base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor - +from ..base import _fit_context from ..tree import DecisionTreeClassifier, DecisionTreeRegressor from ..utils import check_random_state, _safe_indexing from ..utils.extmath import softmax @@ -103,6 +103,10 @@ def _check_X(self, X): reset=False, ) + @_fit_context( + # AdaBoost*.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Build a boosted classifier/regressor from the training set (X, y). 
@@ -124,8 +128,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - X, y = self._validate_data( X, y, diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index d96f5c76842bf..a78e12a5a5181 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -1984,3 +1984,54 @@ def test_regression_criterion_withbins(name, criterion): criterion, score, ) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput_quantiles(name): + # Check estimators on multi-output problems. + X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + est = FOREST_ESTIMATORS[name]( + random_state=0, bootstrap=False, store_leaf_values=True + ) + est.fit(X_train, y_train) + + y_pred = est.predict_quantiles(X_test, quantiles=[0.25, 0.5, 0.75]) + assert_array_almost_equal(y_pred[:, 1, :], y_test) + assert_array_almost_equal(y_pred[:, 0, :], y_test) + assert_array_almost_equal(y_pred[:, 2, :], y_test) + + # test the leaf nodes samples + leaf_nodes_samples = est.get_leaf_node_samples(X_test) + assert len(leaf_nodes_samples) == len(X_test) + for node_samples in leaf_nodes_samples: + assert node_samples.shape[1] == est.n_outputs_ diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index b51ccceaac9d1..60e2cb3b7ad84 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -11,6 +11,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils import check_array from ..utils.validation import check_is_fitted @@ -133,6 +134,7 @@ def _add_iterable_element( indices.append(vocab[feature_name]) values.append(self.dtype(vv)) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn a list of feature name -> indices mappings. @@ -153,7 +155,6 @@ def fit(self, X, y=None): self : object DictVectorizer class instance. """ - self._validate_params() feature_names = [] vocab = {} @@ -286,6 +287,7 @@ def _transform(self, X, fitting): return result_matrix + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Learn a list of feature name -> indices mappings and transform X. @@ -309,7 +311,6 @@ def fit_transform(self, X, y=None): Xa : {array, sparse matrix} Feature vectors; always 2-d. 
""" - self._validate_params() return self._transform(X, fitting=True) def inverse_transform(self, X, dict_type=dict): diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index 1f2513e70eed5..e1b5e5f2561fe 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -8,6 +8,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ._hashing_fast import transform as _hashing_transform from ..utils._param_validation import Interval, StrOptions @@ -121,6 +122,7 @@ def __init__( self.n_features = n_features self.alternate_sign = alternate_sign + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X=None, y=None): """Only validates estimator's parameters. @@ -140,8 +142,6 @@ def fit(self, X=None, y=None): self : object FeatureHasher class instance. """ - # repeat input validation for grid search (which calls set_params) - self._validate_params() return self def transform(self, raw_X): diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 89bdd7557f583..beea3e23e0adc 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -16,6 +16,7 @@ from numpy.lib.stride_tricks import as_strided from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils import check_array, check_random_state from ..utils._param_validation import Hidden, Interval, validate_params from ..utils._param_validation import RealNotInt @@ -561,6 +562,7 @@ def __init__(self, *, patch_size=None, max_patches=None, random_state=None): self.max_patches = max_patches self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validate the parameters of the estimator. @@ -583,7 +585,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() return self def transform(self, X): diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 21863d75eff2f..3201e3a0d51bb 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -25,6 +25,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin +from ..base import _fit_context from ..preprocessing import normalize from ._hash import FeatureHasher from ._stop_words import ENGLISH_STOP_WORDS @@ -801,6 +802,7 @@ def __init__( self.alternate_sign = alternate_sign self.dtype = dtype + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Only validates estimator's parameters. @@ -820,10 +822,9 @@ def partial_fit(self, X, y=None): self : object HashingVectorizer instance. """ - # TODO: only validate during the first call - self._validate_params() return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -843,8 +844,6 @@ def fit(self, X, y=None): self : object HashingVectorizer instance. """ - self._validate_params() - # triggers a parameter validation if isinstance(X, str): raise ValueError( @@ -1338,6 +1337,7 @@ def fit(self, raw_documents, y=None): self.fit_transform(raw_documents) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, raw_documents, y=None): """Learn the vocabulary dictionary and return document-term matrix. 
@@ -1365,7 +1365,6 @@ def fit_transform(self, raw_documents, y=None): "Iterable over raw text documents expected, string object received." ) - self._validate_params() self._validate_ngram_range() self._warn_for_unused_params() self._validate_vocabulary() @@ -1639,6 +1638,7 @@ def __init__(self, *, norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=Fal self.smooth_idf = smooth_idf self.sublinear_tf = sublinear_tf + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn the idf vector (global term weights). @@ -1655,8 +1655,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() - # large sparse data is not supported for 32bit platforms because # _document_frequency uses np.bincount which works on arrays of # dtype NPY_INTP which is int32 for 32bit platforms. See #20923 @@ -2073,6 +2071,7 @@ def _check_params(self): UserWarning, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, raw_documents, y=None): """Learn vocabulary and idf from training set. @@ -2089,7 +2088,6 @@ def fit(self, raw_documents, y=None): self : object Fitted vectorizer. """ - self._validate_params() self._check_params() self._warn_for_unused_params() self._tfidf = TfidfTransformer( diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 7b8de4ae03585..47f98d89e8abe 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -9,6 +9,7 @@ from ._base import SelectorMixin from ._base import _get_feature_importances from ..base import BaseEstimator, clone, MetaEstimatorMixin +from ..base import _fit_context from ..utils._tags import _safe_tags from ..utils.validation import check_is_fitted, check_scalar, _num_features from ..utils._param_validation import HasMethods, Interval, Options @@ -320,6 +321,10 @@ def _check_max_features(self, X): ) self.max_features_ = max_features + @_fit_context( + # SelectFromModel.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, **fit_params): """Fit the SelectFromModel meta-transformer. @@ -340,7 +345,6 @@ def fit(self, X, y=None, **fit_params): self : object Fitted estimator. """ - self._validate_params() self._check_max_features(X) if self.prefit: @@ -375,6 +379,10 @@ def threshold_(self): return _calculate_threshold(self.estimator, scores, self.threshold) @available_if(_estimator_has("partial_fit")) + @_fit_context( + # SelectFromModel.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y=None, **fit_params): """Fit the SelectFromModel meta-transformer only once. 
@@ -398,7 +406,6 @@ def partial_fit(self, X, y=None, **fit_params): first_call = not hasattr(self, "estimator_") if first_call: - self._validate_params() self._check_max_features(X) if self.prefit: diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 214ac9e0c30cf..932d66449ae22 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -22,6 +22,7 @@ from ..base import MetaEstimatorMixin from ..base import clone from ..base import is_classifier +from ..base import _fit_context from ..model_selection import check_cv from ..model_selection._validation import _score from ..metrics import check_scoring @@ -228,6 +229,10 @@ def classes_(self): """ return self.estimator_.classes_ + @_fit_context( + # RFE.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, **fit_params): """Fit the RFE model and then the underlying estimator on the selected features. @@ -248,7 +253,6 @@ def fit(self, X, y, **fit_params): self : object Fitted estimator. """ - self._validate_params() return self._fit(X, y, **fit_params) def _fit(self, X, y, step_score=None, **fit_params): @@ -649,6 +653,10 @@ def __init__( self.n_jobs = n_jobs self.min_features_to_select = min_features_to_select + @_fit_context( + # RFECV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, groups=None): """Fit the RFE model and automatically tune the number of selected features. @@ -674,7 +682,6 @@ def fit(self, X, y, groups=None): self : object Fitted estimator. """ - self._validate_params() tags = self._get_tags() X, y = self._validate_data( X, diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index 8a61bdee0c554..0fbe91273053b 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -7,6 +7,7 @@ from ._base import SelectorMixin from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier +from ..base import _fit_context from ..utils._param_validation import HasMethods, Interval, StrOptions from ..utils._param_validation import RealNotInt from ..utils._tags import _safe_tags @@ -179,6 +180,10 @@ def __init__( self.cv = cv self.n_jobs = n_jobs + @_fit_context( + # SequentialFeatureSelector.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Learn the features to select from X. @@ -197,8 +202,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - tags = self._get_tags() X = self._validate_data( X, diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 18e23d105b8bb..f4355c39f88cd 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -13,6 +13,7 @@ from scipy.sparse import issparse from ..base import BaseEstimator +from ..base import _fit_context from ..preprocessing import LabelBinarizer from ..utils import as_float_array, check_array, check_X_y, safe_sqr, safe_mask from ..utils.extmath import safe_sparse_dot, row_norms @@ -473,6 +474,7 @@ class _BaseFilter(SelectorMixin, BaseEstimator): def __init__(self, score_func): self.score_func = score_func + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Run score function on (X, y) and get the appropriate features. 
@@ -490,8 +492,6 @@ def fit(self, X, y): self : object Returns the instance itself. """ - self._validate_params() - X, y = self._validate_data( X, y, accept_sparse=["csr", "csc"], multi_output=True ) diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 7c8db9cc7fa55..073a22c6ad92b 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -4,6 +4,7 @@ import numpy as np from ..base import BaseEstimator +from ..base import _fit_context from ._base import SelectorMixin from ..utils.sparsefuncs import mean_variance_axis, min_max_axis from ..utils.validation import check_is_fitted @@ -76,6 +77,7 @@ class VarianceThreshold(SelectorMixin, BaseEstimator): def __init__(self, threshold=0.0): self.threshold = threshold + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn empirical variances from X. @@ -94,7 +96,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data( X, accept_sparse=("csr", "csc"), diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index 4a88034768870..50a8739372972 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -13,6 +13,7 @@ from scipy.special import erf, expit from ..base import BaseEstimator, ClassifierMixin, clone +from ..base import _fit_context from .kernels import Kernel, RBF, CompoundKernel, ConstantKernel as C from ..utils.validation import check_is_fitted from ..utils import check_random_state @@ -679,6 +680,7 @@ def __init__( self.multi_class = multi_class self.n_jobs = n_jobs + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit Gaussian process classification model. @@ -695,8 +697,6 @@ def fit(self, X, y): self : object Returns an instance of self. """ - self._validate_params() - if isinstance(self.kernel, CompoundKernel): raise ValueError("kernel cannot be a CompoundKernel") diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 9b7141f71b884..49fcab40c25f8 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -14,6 +14,7 @@ from ..base import BaseEstimator, RegressorMixin, clone from ..base import MultiOutputMixin +from ..base import _fit_context from .kernels import Kernel, RBF, ConstantKernel as C from ..preprocessing._data import _handle_zeros_in_scale from ..utils import check_random_state @@ -214,6 +215,7 @@ def __init__( self.n_targets = n_targets self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit Gaussian process regression model. @@ -230,8 +232,6 @@ def fit(self, X, y): self : object GaussianProcessRegressor class instance. 
""" - self._validate_params() - if self.kernel is None: # Use an RBF kernel as default self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF( 1.0, length_scale_bounds="fixed" diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index b2f296c91740e..37fc43731514a 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -11,6 +11,7 @@ from scipy import sparse as sp from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils._param_validation import StrOptions, MissingValues from ..utils.fixes import _mode from ..utils.sparsefuncs import _get_median @@ -348,6 +349,7 @@ def _validate_input(self, X, in_fit): return X + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the imputer on `X`. @@ -365,8 +367,6 @@ def fit(self, X, y=None): self : object Fitted estimator. """ - self._validate_params() - X = self._validate_input(X, in_fit=True) # default fill_value is 0 for numerical input and "missing_value" @@ -927,6 +927,7 @@ def _fit(self, X, y=None, precomputed=False): return missing_features_info[0] + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the transformer on `X`. @@ -944,7 +945,6 @@ def fit(self, X, y=None): self : object Fitted estimator. """ - self._validate_params() self._fit(X, y) return self @@ -990,6 +990,7 @@ def transform(self, X): return imputer_mask + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Generate missing values indicator for `X`. @@ -1008,7 +1009,6 @@ def fit_transform(self, X, y=None): The missing indicator for input data. The data type of `Xt` will be boolean. """ - self._validate_params() imputer_mask = self._fit(X, y) if self.features_.size < self._n_features: diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 41ed19b7a8948..f977e5bc23e6c 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -7,6 +7,7 @@ import numpy as np from ..base import clone +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..preprocessing import normalize from ..utils import ( @@ -627,7 +628,7 @@ def _initial_imputation(self, X, in_fit=False): strategy=self.initial_strategy, fill_value=self.fill_value, keep_empty_features=self.keep_empty_features, - ) + ).set_output(transform="default") X_filled = self.initial_imputer_.fit_transform(X) else: X_filled = self.initial_imputer_.transform(X) @@ -681,6 +682,10 @@ def _validate_limit(limit, limit_type, n_features): ) return limit + @_fit_context( + # IterativeImputer.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit the imputer on `X` and return the transformed `X`. @@ -698,7 +703,6 @@ def fit_transform(self, X, y=None): Xt : array-like, shape (n_samples, n_features) The imputed input data. 
""" - self._validate_params() self.random_state_ = getattr( self, "random_state_", check_random_state(self.random_state) ) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 5735709dd7f29..915f8cbdb3fcb 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -6,6 +6,7 @@ import numpy as np from ._base import _BaseImputer +from ..base import _fit_context from ..utils.validation import FLOAT_DTYPES from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import _NAN_METRICS @@ -199,6 +200,7 @@ def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col): return np.ma.average(donors, axis=1, weights=weight_matrix).data + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the imputer on X. @@ -216,7 +218,6 @@ def fit(self, X, y=None): self : object The fitted `KNNImputer` class instance. """ - self._validate_params() # Check data integrity and calling arguments if not is_scalar_nan(self.missing_values): force_all_finite = True diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index aa1521ab697d0..a1cf95b95591b 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -11,6 +11,7 @@ import math from .base import BaseEstimator, TransformerMixin, RegressorMixin +from .base import _fit_context from .utils import check_array, check_consistent_length from .utils.validation import _check_sample_weight, check_is_fitted from .utils._param_validation import Interval, StrOptions @@ -310,6 +311,7 @@ def _build_y(self, X, y, sample_weight, trim_duplicates=True): # prediction speed). return X, y + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model using X, y as training data. @@ -338,7 +340,6 @@ def fit(self, X, y, sample_weight=None): X is stored for future use, as :meth:`transform` needs X to interpolate new input data. """ - self._validate_params() check_params = dict(accept_sparse=False, ensure_2d=False) X = check_array( X, input_name="X", dtype=[np.float64, np.float32], **check_params diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index faa098e634937..7f190a2b66823 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -23,6 +23,7 @@ from .base import BaseEstimator from .base import TransformerMixin from .base import ClassNamePrefixFeaturesOutMixin +from .base import _fit_context from .utils import check_random_state from .utils import deprecated from .utils.extmath import safe_sparse_dot @@ -139,6 +140,7 @@ def __init__( self.n_components = n_components self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -160,8 +162,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csc") random_state = check_random_state(self.random_state) @@ -338,6 +338,7 @@ def __init__(self, *, gamma=1.0, n_components=100, random_state=None): self.n_components = n_components self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -358,8 +359,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() - X = self._validate_data(X, accept_sparse="csr") random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -498,6 +497,7 @@ def __init__(self, *, skewedness=1.0, n_components=100, random_state=None): self.n_components = n_components self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -518,7 +518,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X) random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -665,6 +664,7 @@ def __init__(self, *, sample_steps=2, sample_interval=None): self.sample_steps = sample_steps self.sample_interval = sample_interval + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -686,7 +686,6 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - self._validate_params() X = self._validate_data(X, accept_sparse="csr") check_non_negative(X, "X in AdditiveChi2Sampler.fit") @@ -1011,6 +1010,7 @@ def __init__( self.random_state = random_state self.n_jobs = n_jobs + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit estimator to data. @@ -1032,7 +1032,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X, accept_sparse="csr") rnd = check_random_state(self.random_state) n_samples = X.shape[0] diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 111e62938f096..a7bfeefaef651 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -8,6 +8,7 @@ import numpy as np from .base import BaseEstimator, RegressorMixin, MultiOutputMixin +from .base import _fit_context from .utils._param_validation import Interval, StrOptions from .metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS, pairwise_kernels from .linear_model._ridge import _solve_cholesky_kernel @@ -170,6 +171,7 @@ def _get_kernel(self, X, Y=None): def _more_tags(self): return {"pairwise": self.kernel == "precomputed"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Kernel Ridge regression model. @@ -190,8 +192,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - # Convert data X, y = self._validate_data( X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 06d8664dc013b..92c067c850225 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -28,6 +28,7 @@ from numbers import Integral from ..base import BaseEstimator, ClassifierMixin, RegressorMixin, MultiOutputMixin +from ..base import _fit_context from ..preprocessing._data import _is_constant_feature from ..utils import check_array from ..utils.validation import FLOAT_DTYPES @@ -642,6 +643,7 @@ def __init__( self.n_jobs = n_jobs self.positive = positive + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """ Fit linear model. @@ -665,9 +667,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted Estimator. 
""" - - self._validate_params() - n_jobs_ = self.n_jobs accept_sparse = False if self.positive else ["csr", "csc", "coo"] diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 887c6a3ebcbbc..37dc3b81511f5 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -13,6 +13,7 @@ from ._base import LinearModel, _preprocess_data, _rescale_data from ..base import RegressorMixin +from ..base import _fit_context from ..utils.extmath import fast_logdet from scipy.linalg import pinvh from ..utils.validation import _check_sample_weight @@ -267,6 +268,7 @@ def __init__( self.verbose = verbose self.n_iter = n_iter + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model. @@ -288,8 +290,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - max_iter = _deprecate_n_iter(self.n_iter, self.max_iter) X, y = self._validate_data(X, y, dtype=[np.float64, np.float32], y_numeric=True) @@ -665,6 +665,7 @@ def __init__( self.verbose = verbose self.n_iter = n_iter + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model according to the given training data and parameters. @@ -683,9 +684,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - - self._validate_params() - max_iter = _deprecate_n_iter(self.n_iter, self.max_iter) X, y = self._validate_data( diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index ea1ee3115ea93..829c0ab6149f1 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -18,6 +18,7 @@ from ._base import LinearModel, _pre_fit from ..base import RegressorMixin, MultiOutputMixin +from ..base import _fit_context from ._base import _preprocess_data from ..utils import check_array, check_scalar from ..utils.validation import check_random_state @@ -851,6 +852,7 @@ def __init__( self.random_state = random_state self.selection = selection + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Fit model with coordinate descent. @@ -886,8 +888,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ - self._validate_params() - if self.alpha == 0: warnings.warn( ( @@ -1475,6 +1475,7 @@ def _is_multitask(self): def path(X, y, **kwargs): """Compute path with coordinate descent.""" + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit linear model with coordinate descent. @@ -1502,9 +1503,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns an instance of fitted model. """ - - self._validate_params() - # This makes sure that there is no duplication in memory. # Dealing right with copy_X is important in the following: # Multiple functions touch X and subsamples of X and can induce a @@ -2343,6 +2341,7 @@ def __init__( self.random_state = random_state self.selection = selection + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit MultiTaskElasticNet model with coordinate descent. @@ -2367,8 +2366,6 @@ def fit(self, X, y): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ - self._validate_params() - # Need to validate separately here. 
# We can't pass multi_output=True because that would allow y to be csr. check_X_params = dict( diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index caf37a0f473e0..b1bc460f24dff 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -20,6 +20,7 @@ HalfTweedieLossIdentity, ) from ...base import BaseEstimator, RegressorMixin +from ...base import _fit_context from ...utils import check_array from ...utils._openmp_helpers import _openmp_effective_n_threads from ...utils._param_validation import Hidden, Interval, StrOptions @@ -168,6 +169,7 @@ def __init__( self.warm_start = warm_start self.verbose = verbose + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit a Generalized Linear Model. @@ -187,8 +189,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted model. """ - self._validate_params() - X, y = self._validate_data( X, y, diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index a7b848f647560..def2ae273d5c4 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -7,6 +7,7 @@ from scipy import optimize from ..base import BaseEstimator, RegressorMixin +from ..base import _fit_context from ._base import LinearModel from ..utils import axis0_safe_slice from ..utils._param_validation import Interval @@ -273,6 +274,7 @@ def __init__( self.fit_intercept = fit_intercept self.tol = tol + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -293,7 +295,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted `HuberRegressor` estimator. """ - self._validate_params() X, y = self._validate_data( X, y, diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 4be8bb730a0ae..e6c653eb80bb3 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -20,6 +20,7 @@ from ._base import LinearModel, LinearRegression from ._base import _deprecate_normalize, _preprocess_data from ..base import RegressorMixin, MultiOutputMixin +from ..base import _fit_context # mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' from ..utils import arrayfuncs, as_float_array # type: ignore @@ -1097,6 +1098,7 @@ def _fit(self, X, y, max_iter, alpha, fit_path, normalize, Xy=None): self._set_intercept(X_offset, y_offset, X_scale) return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, Xy=None): """Fit the model using X, y as training data. @@ -1118,8 +1120,6 @@ def fit(self, X, y, Xy=None): self : object Returns an instance of self. """ - self._validate_params() - X, y = self._validate_data(X, y, y_numeric=True, multi_output=True) _normalize = _deprecate_normalize( @@ -1691,6 +1691,7 @@ def __init__( def _more_tags(self): return {"multioutput": False} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model using X, y as training data. @@ -1707,8 +1708,6 @@ def fit(self, X, y): self : object Returns an instance of self. """ - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) @@ -2216,6 +2215,7 @@ def __init__( def _more_tags(self): return {"multioutput": False} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, copy_X=None): """Fit the model using X, y as training data. 
@@ -2237,8 +2237,6 @@ def fit(self, X, y, copy_X=None): self : object Returns an instance of self. """ - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 3db27d9cc3163..30a0f40a0f2fd 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -24,6 +24,7 @@ from ._linear_loss import LinearModelLoss from ._sag import sag_solver from ._glm.glm import NewtonCholeskySolver +from ..base import _fit_context from .._loss.loss import HalfBinomialLoss, HalfMultinomialLoss from ..preprocessing import LabelEncoder, LabelBinarizer from ..svm._base import _fit_liblinear @@ -1132,6 +1133,7 @@ def __init__( self.n_jobs = n_jobs self.l1_ratio = l1_ratio + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """ Fit the model according to the given training data. @@ -1161,9 +1163,6 @@ def fit(self, X, y, sample_weight=None): ----- The SAGA solver supports both float64 and float32 bit arrays. """ - - self._validate_params() - solver = _check_solver(self.solver, self.penalty, self.dual) if self.penalty != "elasticnet" and self.l1_ratio is not None: @@ -1745,6 +1744,7 @@ def __init__( self.random_state = random_state self.l1_ratios = l1_ratios + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -1766,9 +1766,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted LogisticRegressionCV estimator. """ - - self._validate_params() - solver = _check_solver(self.solver, self.penalty, self.dual) if self.penalty == "elasticnet": diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index b1dc1e352fd62..df451a99417b0 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -15,6 +15,7 @@ from ._base import LinearModel, _pre_fit, _deprecate_normalize from ..base import RegressorMixin, MultiOutputMixin +from ..base import _fit_context from ..utils import as_float_array, check_array from ..utils.parallel import delayed, Parallel from ..utils._param_validation import Hidden, Interval, StrOptions @@ -725,6 +726,7 @@ def __init__( self.normalize = normalize self.precompute = precompute + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model using X, y as training data. @@ -741,8 +743,6 @@ def fit(self, X, y): self : object Returns an instance of self. """ - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) @@ -1042,6 +1042,7 @@ def __init__( self.n_jobs = n_jobs self.verbose = verbose + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model using X, y as training data. @@ -1058,8 +1059,6 @@ def fit(self, X, y): self : object Returns an instance of self. 
""" - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) diff --git a/sklearn/linear_model/_passive_aggressive.py b/sklearn/linear_model/_passive_aggressive.py index 2cacd4f78cc54..a9c81799c8ca3 100644 --- a/sklearn/linear_model/_passive_aggressive.py +++ b/sklearn/linear_model/_passive_aggressive.py @@ -5,6 +5,7 @@ from ._stochastic_gradient import BaseSGDClassifier from ._stochastic_gradient import BaseSGDRegressor from ._stochastic_gradient import DEFAULT_EPSILON +from ..base import _fit_context from ..utils._param_validation import Interval, StrOptions @@ -220,6 +221,7 @@ def __init__( self.C = C self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None): """Fit linear model with Passive Aggressive algorithm. @@ -245,7 +247,6 @@ def partial_fit(self, X, y, classes=None): Fitted estimator. """ if not hasattr(self, "classes_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) if self.class_weight == "balanced": @@ -276,6 +277,7 @@ def partial_fit(self, X, y, classes=None): intercept_init=None, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None): """Fit linear model with Passive Aggressive algorithm. @@ -298,7 +300,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None): self : object Fitted estimator. """ - self._validate_params() self._more_validate_params() lr = "pa1" if self.loss == "hinge" else "pa2" @@ -504,6 +505,7 @@ def __init__( self.C = C self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y): """Fit linear model with Passive Aggressive algorithm. @@ -521,7 +523,6 @@ def partial_fit(self, X, y): Fitted estimator. """ if not hasattr(self, "coef_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2" @@ -538,6 +539,7 @@ def partial_fit(self, X, y): intercept_init=None, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None): """Fit linear model with Passive Aggressive algorithm. @@ -560,7 +562,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None): self : object Fitted estimator. """ - self._validate_params() self._more_validate_params() lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2" diff --git a/sklearn/linear_model/_quantile.py b/sklearn/linear_model/_quantile.py index 081e3da5b51b7..b4a5581386a5f 100644 --- a/sklearn/linear_model/_quantile.py +++ b/sklearn/linear_model/_quantile.py @@ -9,6 +9,7 @@ from scipy.optimize import linprog from ..base import BaseEstimator, RegressorMixin +from ..base import _fit_context from ._base import LinearModel from ..exceptions import ConvergenceWarning from ..utils import _safe_indexing @@ -141,6 +142,7 @@ def __init__( self.solver = solver self.solver_options = solver_options + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -160,7 +162,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns self. 
""" - self._validate_params() X, y = self._validate_data( X, y, diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 2474a25f07199..1c12ecc13a258 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -9,6 +9,7 @@ from ..base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone from ..base import MultiOutputMixin +from ..base import _fit_context from ..utils import check_random_state, check_consistent_length from ..utils.random import sample_without_replacement from ..utils.validation import check_is_fitted, _check_sample_weight @@ -283,6 +284,10 @@ def __init__( self.random_state = random_state self.loss = loss + @_fit_context( + # RansacRegressor.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit estimator using RANSAC algorithm. @@ -313,8 +318,6 @@ def fit(self, X, y, sample_weight=None): `is_data_valid` and `is_model_valid` return False for all `max_trials` randomly chosen sub-samples. """ - self._validate_params() - # Need to validate separately here. We can't pass multi_output=True # because that would allow y to be csr. Delay expensive finiteness # check to the estimator's own input validation. diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 28ef7cbd43eb7..893b10d1d93ae 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -25,6 +25,7 @@ from ._base import _preprocess_data, _rescale_data from ._sag import sag_solver from ..base import MultiOutputMixin, RegressorMixin, is_classifier +from ..base import _fit_context from ..utils.extmath import safe_sparse_dot from ..utils.extmath import row_norms from ..utils import check_array @@ -1114,6 +1115,7 @@ def __init__( random_state=random_state, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge regression model. @@ -1134,8 +1136,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) X, y = self._validate_data( X, @@ -1423,6 +1423,7 @@ def __init__( ) self.class_weight = class_weight + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge classifier model. @@ -1446,8 +1447,6 @@ def fit(self, X, y, sample_weight=None): self : object Instance of the estimator. """ - self._validate_params() - X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, self.solver) super().fit(X, Y, sample_weight=sample_weight) @@ -2354,6 +2353,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): 0.5166... """ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge regression model with cv. @@ -2383,8 +2383,6 @@ def fit(self, X, y, sample_weight=None): cross-validation takes the sample weights into account when computing the validation score. """ - self._validate_params() - super().fit(X, y, sample_weight=sample_weight) return self @@ -2533,6 +2531,7 @@ def __init__( ) self.class_weight = class_weight + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge classifier with cv. @@ -2555,8 +2554,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. 
""" - self._validate_params() - # `RidgeClassifier` does not accept "sag" or "saga" solver and thus support # csr, csc, and coo sparse matrices. By using solver="eigen" we force to accept # all sparse format. diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 2f27bdee7968b..bc8f31016c6f8 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -13,6 +13,7 @@ from numbers import Integral, Real from ..base import clone, is_classifier +from ..base import _fit_context from ._base import LinearClassifierMixin, SparseCoefMixin from ._base import make_dataset from ..base import BaseEstimator, RegressorMixin, OutlierMixin @@ -805,6 +806,7 @@ def _fit_multiclass(self, X, y, alpha, C, learning_rate, sample_weight, max_iter self._standard_intercept = np.atleast_1d(self.intercept_) self.intercept_ = self._standard_intercept + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None, sample_weight=None): """Perform one epoch of stochastic gradient descent on given samples. @@ -839,7 +841,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): Returns an instance of self. """ if not hasattr(self, "classes_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) if self.class_weight == "balanced": @@ -869,6 +870,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): intercept_init=None, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): """Fit linear model with Stochastic Gradient Descent. @@ -897,7 +899,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): self : object Returns an instance of self. """ - self._validate_params() self._more_validate_params() return self._fit( @@ -1470,6 +1471,7 @@ def _partial_fit( return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, sample_weight=None): """Perform one epoch of stochastic gradient descent on given samples. @@ -1496,7 +1498,6 @@ def partial_fit(self, X, y, sample_weight=None): Returns an instance of self. """ if not hasattr(self, "coef_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) return self._partial_fit( @@ -1565,6 +1566,7 @@ def _fit( return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): """Fit linear model with Stochastic Gradient Descent. @@ -1590,7 +1592,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): self : object Fitted `SGDRegressor` estimator. """ - self._validate_params() self._more_validate_params() return self._fit( @@ -2366,6 +2367,7 @@ def _partial_fit( return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, sample_weight=None): """Fit linear One-Class SVM with Stochastic Gradient Descent. @@ -2386,7 +2388,6 @@ def partial_fit(self, X, y=None, sample_weight=None): Returns a fitted instance of self. """ if not hasattr(self, "coef_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) alpha = self.nu / 2 @@ -2453,6 +2454,7 @@ def _fit( return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None): """Fit linear One-Class SVM with Stochastic Gradient Descent. 
@@ -2485,7 +2487,6 @@ def fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None): self : object Returns a fitted instance of self. """ - self._validate_params() self._more_validate_params() alpha = self.nu / 2 diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 67d6ca532a8ab..72c2d897681c4 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -19,6 +19,7 @@ from ._base import LinearModel from ..base import RegressorMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._param_validation import Interval from ..utils.parallel import delayed, Parallel @@ -395,6 +396,7 @@ def _check_subparams(self, n_samples, n_features): return n_subsamples, n_subpopulation + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit linear model. @@ -410,7 +412,6 @@ def fit(self, X, y): self : returns an instance of self. Fitted `TheilSenRegressor` estimator. """ - self._validate_params() random_state = check_random_state(self.random_state) X, y = self._validate_data(X, y, y_numeric=True) n_samples, n_features = X.shape diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 92206721aac15..0917ef7d207bc 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -12,6 +12,7 @@ from scipy.sparse.csgraph import connected_components from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..neighbors import NearestNeighbors, kneighbors_graph from ..neighbors import radius_neighbors_graph from ..utils.validation import check_is_fitted @@ -235,7 +236,7 @@ def _fit_transform(self, X): tol=self.tol, max_iter=self.max_iter, n_jobs=self.n_jobs, - ) + ).set_output(transform="default") if self.n_neighbors is not None: nbg = kneighbors_graph( @@ -332,6 +333,10 @@ def reconstruction_error(self): evals = self.kernel_pca_.eigenvalues_ return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0] + @_fit_context( + # Isomap.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Compute the embedding vectors for data X. @@ -350,10 +355,13 @@ def fit(self, X, y=None): self : object Returns a fitted instance of self. """ - self._validate_params() self._fit_transform(X) return self + @_fit_context( + # Isomap.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit the model from data in X and transform X. @@ -371,7 +379,6 @@ def fit_transform(self, X, y=None): X_new : array-like, shape (n_samples, n_components) X transformed in the new space. """ - self._validate_params() self._fit_transform(X) return self.embedding_ diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 10a22b12dfd1d..6f57b0627b8be 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -17,6 +17,7 @@ TransformerMixin, _UnstableArchMixin, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..utils import check_random_state, check_array from ..utils._arpack import _init_arpack_v0 @@ -759,6 +760,7 @@ def _fit_transform(self, X): ) self._n_features_out = self.embedding_.shape[1] + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Compute the embedding vectors for data X. @@ -775,10 +777,10 @@ def fit(self, X, y=None): self : object Fitted `LocallyLinearEmbedding` class instance. 
""" - self._validate_params() self._fit_transform(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Compute the embedding vectors for data X and transform X. @@ -795,7 +797,6 @@ def fit_transform(self, X, y=None): X_new : array-like, shape (n_samples, n_components) Returns the instance itself. """ - self._validate_params() self._fit_transform(X) return self.embedding_ diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 7fc46325a1ae1..6b7a818b94ea8 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -13,6 +13,7 @@ import warnings from ..base import BaseEstimator +from ..base import _fit_context from ..metrics import euclidean_distances from ..utils import check_random_state, check_array, check_symmetric from ..isotonic import IsotonicRegression @@ -569,10 +570,10 @@ def fit(self, X, y=None, init=None): self : object Fitted estimator. """ - # parameter will be validated in `fit_transform` call self.fit_transform(X, init=init) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None, init=None): """ Fit the data from `X`, and returns the embedded coordinates. @@ -597,7 +598,6 @@ def fit_transform(self, X, y=None, init=None): X_new : ndarray of shape (n_samples, n_components) X transformed in the new space. """ - self._validate_params() X = self._validate_data(X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": warnings.warn( diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 8291d8326eb05..af965a1362b8f 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -17,6 +17,7 @@ from scipy.sparse.csgraph import laplacian as csgraph_laplacian from ..base import BaseEstimator +from ..base import _fit_context from ..utils import ( check_array, check_random_state, @@ -652,6 +653,7 @@ def _get_affinity_matrix(self, X, Y=None): self.affinity_matrix_ = self.affinity(X) return self.affinity_matrix_ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model from data in X. @@ -674,8 +676,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csr", ensure_min_samples=2) random_state = check_random_state(self.random_state) diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 6ef6ce999cb08..c372ddcca3c2e 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -17,6 +17,7 @@ from numbers import Integral, Real from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClassNamePrefixFeaturesOutMixin, TransformerMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import check_non_negative @@ -1078,6 +1079,10 @@ def _tsne( return X_embedded + @_fit_context( + # TSNE.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit X into an embedded space and return that transformed output. @@ -1099,12 +1104,15 @@ def fit_transform(self, X, y=None): X_new : ndarray of shape (n_samples, n_components) Embedding of the training data in low-dimensional space. 
""" - self._validate_params() self._check_params_vs_input(X) embedding = self._fit(X) self.embedding_ = embedding return self.embedding_ + @_fit_context( + # TSNE.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit X into an embedded space. @@ -1126,7 +1134,6 @@ def fit(self, X, y=None): X_new : array of shape (n_samples, n_components) Embedding of the training data in low-dimensional space. """ - self._validate_params() self.fit_transform(X) return self diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 67b04e9382acb..dbe5b76f0f4c9 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -919,8 +919,9 @@ def haversine_distances(X, Y=None): in radians. The dimension of the data must be 2. .. math:: - D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2) - + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}] + D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x_{lat} - y_{lat}) / 2) + + \\cos(x_{lat})\\cos(y_{lat})\\ + sin^2((x_{lon} - y_{lon}) / 2)}] Parameters ---------- @@ -1220,6 +1221,13 @@ def paired_cosine_distances(X, Y): } +@validate_params( + { + "X": ["array-like"], + "Y": ["array-like"], + "metric": [StrOptions(set(PAIRED_DISTANCES)), callable], + } +) def paired_distances(X, Y, *, metric="euclidean", **kwds): """ Compute the paired distances between X and Y. @@ -1278,8 +1286,6 @@ def paired_distances(X, Y, *, metric="euclidean", **kwds): for i in range(len(X)): distances[i] = metric(X[i], Y[i]) return distances - else: - raise ValueError("Unknown distance %s" % metric) # Kernels diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index a298dfec6a0da..fbca4f1d49dcd 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -16,6 +16,7 @@ from ..cluster import kmeans_plusplus from ..base import BaseEstimator from ..base import DensityMixin +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils import check_random_state from ..utils.validation import check_is_fitted @@ -182,6 +183,7 @@ def fit(self, X, y=None): self.fit_predict(X, y) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_predict(self, X, y=None): """Estimate model parameters using X and predict the labels for X. @@ -209,8 +211,6 @@ def fit_predict(self, X, y=None): labels : array, shape (n_samples,) Component labels. """ - self._validate_params() - X = self._validate_data(X, dtype=[np.float64, np.float32], ensure_min_samples=2) if X.shape[0] < self.n_components: raise ValueError( diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 76dc02e625408..4a3f5d1e239a8 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -33,6 +33,7 @@ from ._search import ParameterSampler from ._plot import LearningCurveDisplay +from ._plot import ValidationCurveDisplay if typing.TYPE_CHECKING: # Avoid errors in type checkers (e.g. mypy) for experimental estimators. @@ -74,6 +75,7 @@ "permutation_test_score", "train_test_split", "validation_curve", + "ValidationCurveDisplay", ] diff --git a/sklearn/model_selection/_plot.py b/sklearn/model_selection/_plot.py index 6a6133a722251..bc5a600e57234 100644 --- a/sklearn/model_selection/_plot.py +++ b/sklearn/model_selection/_plot.py @@ -1,10 +1,140 @@ +import warnings + import numpy as np -from . import learning_curve +from . 
import learning_curve, validation_curve from ..utils import check_matplotlib_support +from ..utils._plotting import _validate_score_name, _interval_max_min_ratio + + +class _BaseCurveDisplay: + def _plot_curve( + self, + x_data, + *, + ax=None, + negate_score=False, + score_name=None, + score_type="test", + log_scale="deprecated", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + check_matplotlib_support(f"{self.__class__.__name__}.plot") + + import matplotlib.pyplot as plt + + if ax is None: + _, ax = plt.subplots() + + if negate_score: + train_scores, test_scores = -self.train_scores, -self.test_scores + else: + train_scores, test_scores = self.train_scores, self.test_scores + + if std_display_style not in ("errorbar", "fill_between", None): + raise ValueError( + f"Unknown std_display_style: {std_display_style}. Should be one of" + " 'errorbar', 'fill_between', or None." + ) + + if score_type not in ("test", "train", "both"): + raise ValueError( + f"Unknown score_type: {score_type}. Should be one of 'test', " + "'train', or 'both'." + ) + + if score_type == "train": + scores = {"Train": train_scores} + elif score_type == "test": + scores = {"Test": test_scores} + else: # score_type == "both" + scores = {"Train": train_scores, "Test": test_scores} + + if std_display_style in ("fill_between", None): + # plot the mean score + if line_kw is None: + line_kw = {} + + self.lines_ = [] + for line_label, score in scores.items(): + self.lines_.append( + *ax.plot( + x_data, + score.mean(axis=1), + label=line_label, + **line_kw, + ) + ) + self.errorbar_ = None + self.fill_between_ = None # overwritten below by fill_between + + if std_display_style == "errorbar": + if errorbar_kw is None: + errorbar_kw = {} + + self.errorbar_ = [] + for line_label, score in scores.items(): + self.errorbar_.append( + ax.errorbar( + x_data, + score.mean(axis=1), + score.std(axis=1), + label=line_label, + **errorbar_kw, + ) + ) + self.lines_, self.fill_between_ = None, None + elif std_display_style == "fill_between": + if fill_between_kw is None: + fill_between_kw = {} + default_fill_between_kw = {"alpha": 0.5} + fill_between_kw = {**default_fill_between_kw, **fill_between_kw} + + self.fill_between_ = [] + for line_label, score in scores.items(): + self.fill_between_.append( + ax.fill_between( + x_data, + score.mean(axis=1) - score.std(axis=1), + score.mean(axis=1) + score.std(axis=1), + **fill_between_kw, + ) + ) + + score_name = self.score_name if score_name is None else score_name + + ax.legend() + # TODO(1.5): to be removed + if log_scale != "deprecated": + warnings.warn( + ( + "The `log_scale` parameter is deprecated as of version 1.3 " + "and will be removed in 1.5. You can use display.ax_.set_xscale " + "and display.ax_.set_yscale instead." + ), + FutureWarning, + ) + xscale = "log" if log_scale else "linear" + else: + # We found that a ratio, smaller or bigger than 5, between the largest and + # smallest gap of the x values is a good indicator to choose between linear + # and log scale. + if _interval_max_min_ratio(x_data) > 5: + xscale = "symlog" if x_data.min() <= 0 else "log" + else: + xscale = "linear" + ax.set_xscale(xscale) + ax.set_ylabel(f"{score_name}") -class LearningCurveDisplay: + self.ax_ = ax + self.figure_ = ax.figure + + +class LearningCurveDisplay(_BaseCurveDisplay): """Learning Curve visualization. It is recommended to use @@ -12,7 +142,10 @@ class LearningCurveDisplay: create a :class:`~sklearn.model_selection.LearningCurveDisplay` instance. 
All parameters are stored as attributes. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide ` for general information + about the visualization API and + :ref:`detailed documentation ` regarding the learning + curve visualization. .. versionadded:: 1.2 @@ -29,9 +162,12 @@ class LearningCurveDisplay: Scores on test set. score_name : str, default=None - The name of the score used in `learning_curve`. It will be used to - decorate the y-axis. If `None`, the generic name `"Score"` will be - used. + The name of the score used in `learning_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. Attributes ---------- @@ -89,8 +225,8 @@ def plot( *, negate_score=False, score_name=None, - score_type="test", - log_scale=False, + score_type="both", + log_scale="deprecated", std_display_style="fill_between", line_kw=None, fill_between_kw=None, @@ -111,16 +247,25 @@ def plot( `scikit-learn`. score_name : str, default=None - The name of the score used to decorate the y-axis of the plot. If - `None`, the generic name "Score" will be used. - - score_type : {"test", "train", "both"}, default="test" + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" The type of score to plot. Can be one of `"test"`, `"train"`, or `"both"`. - log_scale : bool, default=False + log_scale : bool, default="deprecated" Whether or not to use a logarithmic scale for the x-axis. + .. deprecated:: 1.3 + `log_scale` is deprecated in 1.3 and will be removed in 1.5. + Use `display.ax_.set_xscale` and `display.ax_.set_yscale` instead. + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" The style used to display the score standard deviation around the mean score. If None, no standard deviation representation is @@ -143,98 +288,19 @@ def plot( display : :class:`~sklearn.model_selection.LearningCurveDisplay` Object that stores computed values. """ - check_matplotlib_support(f"{self.__class__.__name__}.plot") - - import matplotlib.pyplot as plt - - if ax is None: - _, ax = plt.subplots() - - if negate_score: - train_scores, test_scores = -self.train_scores, -self.test_scores - else: - train_scores, test_scores = self.train_scores, self.test_scores - - if std_display_style not in ("errorbar", "fill_between", None): - raise ValueError( - f"Unknown std_display_style: {std_display_style}. Should be one of" - " 'errorbar', 'fill_between', or None." - ) - - if score_type not in ("test", "train", "both"): - raise ValueError( - f"Unknown score_type: {score_type}. Should be one of 'test', " - "'train', or 'both'." 
- ) - - if score_type == "train": - scores = {"Training metric": train_scores} - elif score_type == "test": - scores = {"Testing metric": test_scores} - else: # score_type == "both" - scores = {"Training metric": train_scores, "Testing metric": test_scores} - - if std_display_style in ("fill_between", None): - # plot the mean score - if line_kw is None: - line_kw = {} - - self.lines_ = [] - for line_label, score in scores.items(): - self.lines_.append( - *ax.plot( - self.train_sizes, - score.mean(axis=1), - label=line_label, - **line_kw, - ) - ) - self.errorbar_ = None - self.fill_between_ = None # overwritten below by fill_between - - if std_display_style == "errorbar": - if errorbar_kw is None: - errorbar_kw = {} - - self.errorbar_ = [] - for line_label, score in scores.items(): - self.errorbar_.append( - ax.errorbar( - self.train_sizes, - score.mean(axis=1), - score.std(axis=1), - label=line_label, - **errorbar_kw, - ) - ) - self.lines_, self.fill_between_ = None, None - elif std_display_style == "fill_between": - if fill_between_kw is None: - fill_between_kw = {} - default_fill_between_kw = {"alpha": 0.5} - fill_between_kw = {**default_fill_between_kw, **fill_between_kw} - - self.fill_between_ = [] - for line_label, score in scores.items(): - self.fill_between_.append( - ax.fill_between( - self.train_sizes, - score.mean(axis=1) - score.std(axis=1), - score.mean(axis=1) + score.std(axis=1), - **fill_between_kw, - ) - ) - - score_name = self.score_name if score_name is None else score_name - - ax.legend() - if log_scale: - ax.set_xscale("log") - ax.set_xlabel("Number of samples in the training set") - ax.set_ylabel(f"{score_name}") - - self.ax_ = ax - self.figure_ = ax.figure + self._plot_curve( + self.train_sizes, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + log_scale=log_scale, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel("Number of samples in the training set") return self @classmethod @@ -259,8 +325,8 @@ def from_estimator( ax=None, negate_score=False, score_name=None, - score_type="test", - log_scale=False, + score_type="both", + log_scale="deprecated", std_display_style="fill_between", line_kw=None, fill_between_kw=None, @@ -268,6 +334,11 @@ def from_estimator( ): """Create a learning curve display from an estimator. + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the learning curve + visualization. + Parameters ---------- estimator : object type that implements the "fit" and "predict" methods @@ -368,16 +439,25 @@ def from_estimator( `scikit-learn`. score_name : str, default=None - The name of the score used to decorate the y-axis of the plot. - If `None`, the generic `"Score"` name will be used. - - score_type : {"test", "train", "both"}, default="test" + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" The type of score to plot. Can be one of `"test"`, `"train"`, or `"both"`. 
- log_scale : bool, default=False + log_scale : bool, default="deprecated" Whether or not to use a logarithmic scale for the x-axis. + .. deprecated:: 1.3 + `log_scale` is deprecated in 1.3 and will be removed in 1.5. + Use `display.ax_.xscale` and `display.ax_.yscale` instead. + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" The style used to display the score standard deviation around the mean score. If `None`, no representation of the standard deviation @@ -414,7 +494,7 @@ def from_estimator( """ check_matplotlib_support(f"{cls.__name__}.from_estimator") - score_name = "Score" if score_name is None else score_name + score_name = _validate_score_name(score_name, scoring, negate_score) train_sizes, train_scores, test_scores = learning_curve( estimator, @@ -451,3 +531,377 @@ def from_estimator( fill_between_kw=fill_between_kw, errorbar_kw=errorbar_kw, ) + + +class ValidationCurveDisplay(_BaseCurveDisplay): + """Validation Curve visualization. + + It is recommended to use + :meth:`~sklearn.model_selection.ValidationCurveDisplay.from_estimator` to + create a :class:`~sklearn.model_selection.ValidationCurveDisplay` instance. + All parameters are stored as attributes. + + Read more in the :ref:`User Guide ` for general information + about the visualization API and :ref:`detailed documentation + ` regarding the validation curve visualization. + + .. versionadded:: 1.3 + + Parameters + ---------- + param_name : str + Name of the parameter that has been varied. + + param_range : ndarray of shape (n_ticks,) + The values of the parameter that have been evaluated. + + train_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on test set. + + score_name : str, default=None + The name of the score used in `validation_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. + + Attributes + ---------- + ax_ : matplotlib Axes + Axes with the validation curve. + + figure_ : matplotlib Figure + Figure containing the validation curve. + + errorbar_ : list of matplotlib Artist or None + When the `std_display_style` is `"errorbar"`, this is a list of + `matplotlib.container.ErrorbarContainer` objects. If another style is + used, `errorbar_` is `None`. + + lines_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.lines.Line2D` objects corresponding to the mean train and + test scores. If another style is used, `line_` is `None`. + + fill_between_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.collections.PolyCollection` objects. If another style is + used, `fill_between_` is `None`. + + See Also + -------- + sklearn.model_selection.validation_curve : Compute the validation curve. 
+ + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay, validation_curve + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> train_scores, test_scores = validation_curve( + ... logistic_regression, X, y, param_name=param_name, param_range=param_range + ... ) + >>> display = ValidationCurveDisplay( + ... param_name=param_name, param_range=param_range, + ... train_scores=train_scores, test_scores=test_scores, score_name="Score" + ... ) + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__( + self, *, param_name, param_range, train_scores, test_scores, score_name=None + ): + self.param_name = param_name + self.param_range = param_range + self.train_scores = train_scores + self.test_scores = test_scores + self.score_name = score_name + + def plot( + self, + ax=None, + *, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If None, no standard deviation representation is + displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. 
+ """ + self._plot_curve( + self.param_range, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + log_scale="deprecated", + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel(f"{self.param_name}") + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + param_name, + param_range, + groups=None, + cv=None, + scoring=None, + n_jobs=None, + pre_dispatch="all", + verbose=0, + error_score=np.nan, + fit_params=None, + ax=None, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Create a validation curve display from an estimator. + + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the validation curve + visualization. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : str + Name of the parameter that will be varied. + + param_range : array-like of shape (n_values,) + The values of the parameter that will be evaluated. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and `y` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all + other cases, :class:`~sklearn.model_selectionKFold` is used. These + splitters are instantiated with `shuffle=False` so the splits will + be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + scoring : str or callable, default=None + A string (see :ref:`scoring_parameter`) or + a scorer callable object / function with signature + `scorer(estimator, X, y)` (see :ref:`scoring`). + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + computing the score are parallelized over the different training + and test sets. `None` means 1 unless in a + :obj:`joblib.parallel_backend` context. `-1` means using all + processors. See :term:`Glossary ` for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. 
+ + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator + fitting. If set to 'raise', the error is raised. If a numeric value + is given, FitFailedWarning is raised. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If `None`, no representation of the standard deviation + is displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> ValidationCurveDisplay.from_estimator( + ... logistic_regression, X, y, param_name=param_name, + ... param_range=param_range, + ... 
) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + score_name = _validate_score_name(score_name, scoring, negate_score) + + train_scores, test_scores = validation_curve( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + groups=groups, + cv=cv, + scoring=scoring, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + verbose=verbose, + error_score=error_score, + fit_params=fit_params, + ) + + viz = cls( + param_name=param_name, + param_range=param_range, + train_scores=train_scores, + test_scores=test_scores, + score_name=score_name, + ) + return viz.plot( + ax=ax, + negate_score=negate_score, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 1621dd324f81c..695614f4e1fa0 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -26,6 +26,7 @@ from ..base import BaseEstimator, is_classifier, clone from ..base import MetaEstimatorMixin +from ..base import _fit_context from ._split import check_cv from ._validation import _fit_and_score from ._validation import _aggregate_score_dicts @@ -753,6 +754,10 @@ def _select_best_index(refit, refit_metric, results): best_index = results[f"rank_test_{refit_metric}"].argmin() return best_index + @_fit_context( + # *SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, *, groups=None, **fit_params): """Run fit with all sets of parameters. @@ -786,7 +791,6 @@ def fit(self, X, y=None, *, groups=None, **fit_params): self : object Instance of fitted estimator. """ - self._validate_params() estimator = self.estimator refit_metric = "score" diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 4826e7931d4d6..a061d7283b46d 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -7,6 +7,7 @@ from ._search import BaseSearchCV from . import ParameterGrid, ParameterSampler from ..base import is_classifier +from ..base import _fit_context from ._split import check_cv, _yields_constant_splits from ..metrics._scorer import get_scorer_names from ..utils import resample @@ -211,6 +212,10 @@ def _select_best_index(refit, refit_metric, results): return last_iter_indices[best_idx] + @_fit_context( + # Halving*SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, groups=None, **fit_params): """Run fit with all sets of parameters. @@ -238,7 +243,6 @@ def fit(self, X, y=None, groups=None, **fit_params): self : object Instance of fitted estimator. 
""" - self._validate_params() self._checked_cv_orig = check_cv( self.cv, y, classifier=is_classifier(self.estimator) ) diff --git a/sklearn/model_selection/tests/test_plot.py b/sklearn/model_selection/tests/test_plot.py index 762af8fe08336..6baa211d2dc6e 100644 --- a/sklearn/model_selection/tests/test_plot.py +++ b/sklearn/model_selection/tests/test_plot.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from sklearn.datasets import load_iris @@ -5,8 +6,8 @@ from sklearn.utils import shuffle from sklearn.utils._testing import assert_allclose, assert_array_equal -from sklearn.model_selection import learning_curve -from sklearn.model_selection import LearningCurveDisplay +from sklearn.model_selection import learning_curve, validation_curve +from sklearn.model_selection import LearningCurveDisplay, ValidationCurveDisplay @pytest.fixture @@ -21,18 +22,22 @@ def data(): ({"score_type": "invalid"}, ValueError, "Unknown score_type:"), ], ) -def test_learning_curve_display_parameters_validation( - pyplot, data, params, err_type, err_msg +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_parameters_validation( + pyplot, data, params, err_type, err_msg, CurveDisplay, specific_params ): """Check that we raise a proper error when passing invalid parameters.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] with pytest.raises(err_type, match=err_msg): - LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, **params - ) + CurveDisplay.from_estimator(estimator, X, y, **specific_params, **params) def test_learning_curve_display_default_usage(pyplot, data): @@ -63,7 +68,7 @@ def test_learning_curve_display_default_usage(pyplot, data): assert display.ax_.get_ylabel() == "Score" _, legend_labels = display.ax_.get_legend_handles_labels() - assert legend_labels == ["Testing metric"] + assert legend_labels == ["Train", "Test"] train_sizes_abs, train_scores, test_scores = learning_curve( estimator, X, y, train_sizes=train_sizes @@ -74,21 +79,63 @@ def test_learning_curve_display_default_usage(pyplot, data): assert_allclose(display.test_scores, test_scores) -def test_learning_curve_display_negate_score(pyplot, data): +def test_validation_curve_display_default_usage(pyplot, data): + """Check the default usage of the ValidationCurveDisplay class.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name, param_range = "max_depth", [1, 3, 5] + display = ValidationCurveDisplay.from_estimator( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + import matplotlib as mpl + + assert display.errorbar_ is None + + assert isinstance(display.lines_, list) + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + + assert isinstance(display.fill_between_, list) + for fill in display.fill_between_: + assert isinstance(fill, mpl.collections.PolyCollection) + assert fill.get_alpha() == 0.5 + + assert display.score_name == "Score" + assert display.ax_.get_xlabel() == f"{param_name}" + assert display.ax_.get_ylabel() == "Score" + + _, legend_labels = display.ax_.get_legend_handles_labels() + assert legend_labels == ["Train", "Test"] + + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + assert display.param_range == param_range + 
assert_array_equal(display.param_range, param_range) + assert_allclose(display.train_scores, train_scores) + assert_allclose(display.test_scores, test_scores) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_negate_score(pyplot, data, CurveDisplay, specific_params): """Check the behaviour of the `negate_score` parameter calling `from_estimator` and `plot`. """ X, y = data estimator = DecisionTreeClassifier(max_depth=1, random_state=0) - train_sizes = [0.3, 0.6, 0.9] negate_score = False - display = LearningCurveDisplay.from_estimator( - estimator, - X, - y, - train_sizes=train_sizes, - negate_score=negate_score, + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score ) positive_scores = display.lines_[0].get_data()[1] @@ -96,22 +143,18 @@ def test_learning_curve_display_negate_score(pyplot, data): assert display.ax_.get_ylabel() == "Score" negate_score = True - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, negate_score=negate_score + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score ) negative_scores = display.lines_[0].get_data()[1] assert (negative_scores <= 0).all() assert_allclose(negative_scores, -positive_scores) - assert display.ax_.get_ylabel() == "Score" + assert display.ax_.get_ylabel() == "Negative score" negate_score = False - display = LearningCurveDisplay.from_estimator( - estimator, - X, - y, - train_sizes=train_sizes, - negate_score=negate_score, + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score ) assert display.ax_.get_ylabel() == "Score" display.plot(negate_score=not negate_score) @@ -122,23 +165,30 @@ def test_learning_curve_display_negate_score(pyplot, data): @pytest.mark.parametrize( "score_name, ylabel", [(None, "Score"), ("Accuracy", "Accuracy")] ) -def test_learning_curve_display_score_name(pyplot, data, score_name, ylabel): +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_score_name( + pyplot, data, score_name, ylabel, CurveDisplay, specific_params +): """Check that we can overwrite the default score name shown on the y-axis.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, score_name=score_name + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name ) assert display.ax_.get_ylabel() == ylabel X, y = data estimator = DecisionTreeClassifier(max_depth=1, random_state=0) - train_sizes = [0.3, 0.6, 0.9] - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, score_name=score_name + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name ) assert display.score_name == ylabel @@ -166,7 +216,7 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): ) _, legend_label = display.ax_.get_legend_handles_labels() - assert legend_label == ["Training metric"] + assert legend_label == ["Train"] if std_display_style 
is None: assert len(display.lines_) == 1 @@ -191,7 +241,7 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): ) _, legend_label = display.ax_.get_legend_handles_labels() - assert legend_label == ["Testing metric"] + assert legend_label == ["Test"] if std_display_style is None: assert len(display.lines_) == 1 @@ -216,7 +266,7 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): ) _, legend_label = display.ax_.get_legend_handles_labels() - assert legend_label == ["Training metric", "Testing metric"] + assert legend_label == ["Train", "Test"] if std_display_style is None: assert len(display.lines_) == 2 @@ -235,100 +285,220 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): assert_allclose(y_data_test, test_scores.mean(axis=1)) -def test_learning_curve_display_log_scale(pyplot, data): - """Check the behaviour of the parameter `log_scale`.""" +@pytest.mark.parametrize("std_display_style", (None, "errorbar")) +def test_validation_curve_display_score_type(pyplot, data, std_display_style): + """Check the behaviour of setting the `score_type` parameter.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, log_scale=True + param_name, param_range = "max_depth", [1, 3, 5] + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range ) - assert display.ax_.get_xscale() == "log" - assert display.ax_.get_yscale() == "linear" + score_type = "train" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, log_scale=False + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + assert_allclose(y_data, train_scores.mean(axis=1)) + + score_type = "test" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, ) - assert display.ax_.get_xscale() == "linear" - assert display.ax_.get_yscale() == "linear" + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Test"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + assert_allclose(y_data, test_scores.mean(axis=1)) + + score_type = "both" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train", "Test"] + + if std_display_style is 
None: + assert len(display.lines_) == 2 + assert display.errorbar_ is None + x_data_train, y_data_train = display.lines_[0].get_data() + x_data_test, y_data_test = display.lines_[1].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 2 + x_data_train, y_data_train = display.errorbar_[0].lines[0].get_data() + x_data_test, y_data_test = display.errorbar_[1].lines[0].get_data() + + assert_array_equal(x_data_train, param_range) + assert_allclose(y_data_train, train_scores.mean(axis=1)) + assert_array_equal(x_data_test, param_range) + assert_allclose(y_data_test, test_scores.mean(axis=1)) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params, expected_xscale", + [ + ( + ValidationCurveDisplay, + {"param_name": "max_depth", "param_range": np.arange(1, 5)}, + "linear", + ), + (LearningCurveDisplay, {"train_sizes": np.linspace(0.1, 0.9, num=5)}, "linear"), + ( + ValidationCurveDisplay, + { + "param_name": "max_depth", + "param_range": np.round(np.logspace(0, 2, num=5)).astype(np.int64), + }, + "log", + ), + (LearningCurveDisplay, {"train_sizes": np.logspace(-1, 0, num=5)}, "log"), + ], +) +def test_curve_display_xscale_auto( + pyplot, data, CurveDisplay, specific_params, expected_xscale +): + """Check the behaviour of the x-axis scaling depending on the data provided.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + display = CurveDisplay.from_estimator(estimator, X, y, **specific_params) + assert display.ax_.get_xscale() == expected_xscale -def test_learning_curve_display_std_display_style(pyplot, data): + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_std_display_style(pyplot, data, CurveDisplay, specific_params): """Check the behaviour of the parameter `std_display_style`.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) import matplotlib as mpl - train_sizes = [0.3, 0.6, 0.9] std_display_style = None - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, ) - assert len(display.lines_) == 1 - assert isinstance(display.lines_[0], mpl.lines.Line2D) + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) assert display.errorbar_ is None assert display.fill_between_ is None _, legend_label = display.ax_.get_legend_handles_labels() - assert len(legend_label) == 1 + assert len(legend_label) == 2 std_display_style = "fill_between" - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, ) - assert len(display.lines_) == 1 - assert isinstance(display.lines_[0], mpl.lines.Line2D) + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) assert display.errorbar_ is None - assert len(display.fill_between_) == 1 - assert isinstance(display.fill_between_[0], mpl.collections.PolyCollection) + assert len(display.fill_between_) == 2 + for fill_between in display.fill_between_: + assert isinstance(fill_between, mpl.collections.PolyCollection) _, legend_label = display.ax_.get_legend_handles_labels() - assert len(legend_label) == 1 + assert len(legend_label) == 2 
std_display_style = "errorbar" - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, ) assert display.lines_ is None - assert len(display.errorbar_) == 1 - assert isinstance(display.errorbar_[0], mpl.container.ErrorbarContainer) + assert len(display.errorbar_) == 2 + for errorbar in display.errorbar_: + assert isinstance(errorbar, mpl.container.ErrorbarContainer) assert display.fill_between_ is None _, legend_label = display.ax_.get_legend_handles_labels() - assert len(legend_label) == 1 + assert len(legend_label) == 2 -def test_learning_curve_display_plot_kwargs(pyplot, data): +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_plot_kwargs(pyplot, data, CurveDisplay, specific_params): """Check the behaviour of the different plotting keyword arguments: `line_kw`, `fill_between_kw`, and `errorbar_kw`.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] std_display_style = "fill_between" line_kw = {"color": "red"} fill_between_kw = {"color": "red", "alpha": 1.0} - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, line_kw=line_kw, fill_between_kw=fill_between_kw, @@ -342,13 +512,36 @@ def test_learning_curve_display_plot_kwargs(pyplot, data): std_display_style = "errorbar" errorbar_kw = {"color": "red"} - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, errorbar_kw=errorbar_kw, ) assert display.errorbar_[0].lines[0].get_color() == "red" + + +# TODO(1.5): to be removed +def test_learning_curve_display_deprecate_log_scale(data, pyplot): + """Check that we warn for the deprecated parameter `log_scale`.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + with pytest.warns(FutureWarning, match="`log_scale` parameter is deprecated"): + display = LearningCurveDisplay.from_estimator( + estimator, X, y, train_sizes=[0.3, 0.6, 0.9], log_scale=True + ) + + assert display.ax_.get_xscale() == "log" + assert display.ax_.get_yscale() == "linear" + + with pytest.warns(FutureWarning, match="`log_scale` parameter is deprecated"): + display = LearningCurveDisplay.from_estimator( + estimator, X, y, train_sizes=[0.3, 0.6, 0.9], log_scale=False + ) + + assert display.ax_.get_xscale() == "linear" + assert display.ax_.get_yscale() == "linear" diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 74684e608d3c1..4c30bcdb6cac3 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -43,6 +43,7 @@ from .base import BaseEstimator, ClassifierMixin, clone, is_classifier from .base import MultiOutputMixin from .base import MetaEstimatorMixin, is_regressor +from .base import _fit_context from .preprocessing import LabelBinarizer from .metrics.pairwise import pairwise_distances_argmin from .utils import check_random_state @@ -296,6 +297,10 @@ def __init__(self, estimator, *, n_jobs=None, verbose=0): self.n_jobs = n_jobs self.verbose = verbose + @_fit_context( + # OneVsRestClassifier.estimator is not validated yet + 
prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit underlying estimators. @@ -313,8 +318,6 @@ def fit(self, X, y): self : object Instance of fitted estimator. """ - self._validate_params() - # A sparse LabelBinarizer, with sparse_output=True, has been shown to # outperform or match a dense label binarizer in all cases and has also # resulted in less or equal memory consumption in the fit_ovr function @@ -348,6 +351,10 @@ def fit(self, X, y): return self @available_if(_estimators_has("partial_fit")) + @_fit_context( + # OneVsRestClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y, classes=None): """Partially fit underlying estimators. @@ -376,8 +383,6 @@ def partial_fit(self, X, y, classes=None): Instance of partially fitted estimator. """ if _check_partial_fit_first_call(self, classes): - self._validate_params() - if not hasattr(self.estimator, "partial_fit"): raise ValueError( ("Base estimator {0}, doesn't have partial_fit method").format( @@ -655,6 +660,10 @@ def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs + @_fit_context( + # OneVsOneClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit underlying estimators. @@ -671,7 +680,6 @@ def fit(self, X, y): self : object The fitted underlying estimator. """ - self._validate_params() # We need to validate the data because we do a safe_indexing later. X, y = self._validate_data( X, y, accept_sparse=["csr", "csc"], force_all_finite=False @@ -706,6 +714,10 @@ def fit(self, X, y): return self @available_if(_estimators_has("partial_fit")) + @_fit_context( + # OneVsOneClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y, classes=None): """Partially fit underlying estimators. @@ -735,8 +747,6 @@ def partial_fit(self, X, y, classes=None): """ first_call = _check_partial_fit_first_call(self, classes) if first_call: - self._validate_params() - self.estimators_ = [ clone(self.estimator) for _ in range(self.n_classes_ * (self.n_classes_ - 1) // 2) @@ -968,6 +978,10 @@ def __init__(self, estimator, *, code_size=1.5, random_state=None, n_jobs=None): self.random_state = random_state self.n_jobs = n_jobs + @_fit_context( + # OutputCodeClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit underlying estimators. @@ -984,7 +998,6 @@ def fit(self, X, y): self : object Returns a fitted instance of self. """ - self._validate_params() y = self._validate_data(X="no_validation", y=y) random_state = check_random_state(self.random_state) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 90c1f04f7e46a..8bb954e976f4c 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -28,6 +28,7 @@ RegressorMixin, clone, is_classifier, + _fit_context, ) from .model_selection import cross_val_predict from .utils import _print_elapsed_time, check_random_state, Bunch @@ -104,6 +105,10 @@ def __init__(self, estimator, *, n_jobs=None): self.n_jobs = n_jobs @_available_if_estimator_has("partial_fit") + @_fit_context( + # MultiOutput*.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_params): """Incrementally fit a separate model for each class output. 
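The hunks above and below repeat the pattern used throughout this patch: the explicit `self._validate_params()` call is removed from each `fit`/`partial_fit` body and replaced by an `@_fit_context(...)` decorator imported from `sklearn.base`, with `prefer_skip_nested_validation=False` whenever a wrapped estimator or a parameter such as `metric` "is not validated yet" (per the inline comments) and `True` otherwise. The real `_fit_context` lives in `sklearn/base.py` and is not part of this diff; the snippet below is only a minimal sketch of the idea, and its name and behaviour are assumptions made for illustration.

    # Hypothetical stand-in for sklearn.base._fit_context (illustration only).
    import functools

    def fit_context_sketch(*, prefer_skip_nested_validation):
        def decorator(fit_method):
            @functools.wraps(fit_method)
            def wrapper(estimator, *args, **kwargs):
                # Validate the estimator's own constructor parameters here,
                # instead of calling self._validate_params() inside every fit.
                estimator._validate_params()
                # A full implementation would also record whether nested
                # sub-estimators may safely skip re-validating their own
                # parameters, which is what prefer_skip_nested_validation
                # controls.
                return fit_method(estimator, *args, **kwargs)
            return wrapper
        return decorator

Plain estimators whose parameters are all declared (e.g. `GaussianNB`, `BernoulliRBM`) pass `prefer_skip_nested_validation=True`, while meta-estimators such as `MultiOutputClassifier` or `ClassifierChain` keep it `False` because their inner estimator's parameters still need their own validation.
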
@@ -151,9 +156,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_para first_time = not hasattr(self, "estimators_") - if first_time: - self._validate_params() - y = self._validate_data(X="no_validation", y=y, multi_output=True) if y.ndim == 1: @@ -203,6 +205,10 @@ def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_para return self + @_fit_context( + # MultiOutput*.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None, **fit_params): """Fit the model to data, separately for each output variable. @@ -230,8 +236,6 @@ def fit(self, X, y, sample_weight=None, **fit_params): self : object Returns a fitted instance. """ - self._validate_params() - if not hasattr(self.estimator, "fit"): raise ValueError("The base estimator should implement a fit method") @@ -887,6 +891,10 @@ class labels for each estimator in the chain. [0.0321..., 0.9935..., 0.0625...]]) """ + @_fit_context( + # ClassifierChain.base_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, Y, **fit_params): """Fit the model to data matrix X and targets Y. @@ -917,8 +925,6 @@ def fit(self, X, Y, **fit_params): "See the User Guide for more information." ) - self._validate_params() - super().fit(X, Y, **fit_params) self.classes_ = [ estimator.classes_ for chain_idx, estimator in enumerate(self.estimators_) @@ -1109,6 +1115,10 @@ class RegressorChain(MetaEstimatorMixin, RegressorMixin, _BaseChain): [2., 0.]]) """ + @_fit_context( + # RegressorChain.base_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, Y, **fit_params): """Fit the model to data matrix X and targets Y. @@ -1131,8 +1141,6 @@ def fit(self, X, Y, **fit_params): self : object Returns a fitted instance. """ - self._validate_params() - super().fit(X, Y, **fit_params) return self diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 20858ac8b5577..76d7189385828 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -22,6 +22,7 @@ from scipy.special import logsumexp from .base import BaseEstimator, ClassifierMixin +from .base import _fit_context from .preprocessing import binarize from .preprocessing import LabelBinarizer from .preprocessing import label_binarize @@ -239,6 +240,7 @@ def __init__(self, *, priors=None, var_smoothing=1e-9): self.priors = priors self.var_smoothing = var_smoothing + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Gaussian Naive Bayes according to X, y. @@ -262,7 +264,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() y = self._validate_data(y=y) return self._partial_fit( X, y, np.unique(y), _refit=True, sample_weight=sample_weight @@ -346,6 +347,7 @@ def _update_mean_variance(n_past, mu, var, X, sample_weight=None): return total_mu, total_var + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None, sample_weight=None): """Incremental fit on a batch of samples. @@ -386,8 +388,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): self : object Returns the instance itself. 
""" - self._validate_params() - return self._partial_fit( X, y, classes, _refit=False, sample_weight=sample_weight ) @@ -643,6 +643,7 @@ def _check_alpha(self): return np.maximum(alpha, alpha_lower_bound) return alpha + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None, sample_weight=None): """Incremental fit on a batch of samples. @@ -682,9 +683,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): """ first_call = not hasattr(self, "classes_") - if first_call: - self._validate_params() - X, y = self._check_X_y(X, y, reset=first_call) _, n_features = X.shape @@ -728,6 +726,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): self._update_class_log_prior(class_prior=class_prior) return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Naive Bayes classifier according to X, y. @@ -748,7 +747,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() X, y = self._check_X_y(X, y) _, n_features = X.shape diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index dbc070987d5d0..e3e2049a8f8e5 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -18,6 +18,7 @@ from ._base import _get_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import ClassifierMixin +from ..base import _fit_context from ..metrics._pairwise_distances_reduction import ArgKminClassMode from ..utils._param_validation import StrOptions from sklearn.neighbors._base import _check_precomputed @@ -203,6 +204,10 @@ def __init__( ) self.weights = weights + @_fit_context( + # KNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the k-nearest neighbors classifier from the training dataset. @@ -221,8 +226,6 @@ def fit(self, X, y): self : KNeighborsClassifier The fitted k-nearest neighbors classifier. """ - self._validate_params() - return self._fit(X, y) def predict(self, X): @@ -572,6 +575,10 @@ def __init__( self.weights = weights self.outlier_label = outlier_label + @_fit_context( + # RadiusNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the radius neighbors classifier from the training dataset. @@ -590,7 +597,6 @@ def fit(self, X, y): self : RadiusNeighborsClassifier The fitted radius neighbors classifier. """ - self._validate_params() self._fit(X, y) classes_ = self.classes_ diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 418761c2d21ee..e815d12e293c9 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -8,6 +8,7 @@ from ._base import NeighborsBase from ._unsupervised import NearestNeighbors from ..base import TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils._param_validation import StrOptions from ..utils.validation import check_is_fitted @@ -372,6 +373,10 @@ def __init__( ) self.mode = mode + @_fit_context( + # KNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the k-nearest neighbors transformer from the training dataset. @@ -388,7 +393,6 @@ def fit(self, X, y=None): self : KNeighborsTransformer The fitted k-nearest neighbors transformer. 
""" - self._validate_params() self._fit(X) self._n_features_out = self.n_samples_fit_ return self @@ -600,6 +604,10 @@ def __init__( ) self.mode = mode + @_fit_context( + # RadiusNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the radius neighbors transformer from the training dataset. @@ -617,7 +625,6 @@ def fit(self, X, y=None): self : RadiusNeighborsTransformer The fitted radius neighbors transformer. """ - self._validate_params() self._fit(X) self._n_features_out = self.n_samples_fit_ return self diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index f285b03403b5f..7f7b38497d209 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -10,6 +10,7 @@ from scipy.special import gammainc from ..base import BaseEstimator +from ..base import _fit_context from ..neighbors._base import VALID_METRICS from ..utils import check_random_state from ..utils.validation import _check_sample_weight, check_is_fitted @@ -185,6 +186,10 @@ def _choose_algorithm(self, algorithm, metric): ) return algorithm + @_fit_context( + # KernelDensity.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, sample_weight=None): """Fit the Kernel Density model on the data. @@ -208,8 +213,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - algorithm = self._choose_algorithm(self.algorithm, self.metric) if isinstance(self.bandwidth, str): diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index 90b3b0aa3d8ce..40cdc9ab5fb9d 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -8,6 +8,7 @@ from ._base import NeighborsBase from ._base import KNeighborsMixin from ..base import OutlierMixin +from ..base import _fit_context from numbers import Real from ..utils._param_validation import Interval, StrOptions @@ -256,6 +257,10 @@ def fit_predict(self, X, y=None): return self.fit(X)._predict() + @_fit_context( + # LocalOutlierFactor.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the local outlier factor detector from the training dataset. @@ -273,8 +278,6 @@ def fit(self, X, y=None): self : LocalOutlierFactor The fitted local outlier factor detector. """ - self._validate_params() - self._fit(X) n_samples = self.n_samples_fit_ diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 4a83fcc7bc080..246f0adcb36ad 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -15,6 +15,7 @@ from ..utils.extmath import softmax from ..metrics import pairwise_distances from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..preprocessing import LabelEncoder from ..decomposition import PCA from ..utils.multiclass import check_classification_targets @@ -215,6 +216,7 @@ def __init__( self.verbose = verbose self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model according to the given training data. @@ -231,8 +233,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() - # Validate the inputs X and y, and converts y to numerical classes. 
X, y = self._validate_data(X, y, ensure_min_samples=2) check_classification_targets(y) diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index 7b9c2479747d3..315393bf597e4 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -13,6 +13,7 @@ from scipy import sparse as sp from ..base import BaseEstimator, ClassifierMixin +from ..base import _fit_context from ..metrics.pairwise import pairwise_distances_argmin from ..preprocessing import LabelEncoder from ..utils.validation import check_is_fitted @@ -122,6 +123,7 @@ def __init__(self, metric="euclidean", *, shrink_threshold=None): self.metric = metric self.shrink_threshold = shrink_threshold + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """ Fit the NearestCentroid model according to the given training data. @@ -140,8 +142,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() - if isinstance(self.metric, str) and self.metric not in ( "manhattan", "euclidean", diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 003b534074ecd..b2050345c9833 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -17,6 +17,7 @@ from ._base import _get_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import RegressorMixin +from ..base import _fit_context from ..utils._param_validation import StrOptions @@ -194,6 +195,10 @@ def _more_tags(self): # For cross-validation routines to split data correctly return {"pairwise": self.metric == "precomputed"} + @_fit_context( + # KNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the k-nearest neighbors regressor from the training dataset. @@ -212,8 +217,6 @@ def fit(self, X, y): self : KNeighborsRegressor The fitted k-nearest neighbors regressor. """ - self._validate_params() - return self._fit(X, y) def predict(self, X): @@ -422,6 +425,10 @@ def __init__( ) self.weights = weights + @_fit_context( + # RadiusNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the radius neighbors regressor from the training dataset. @@ -440,7 +447,6 @@ def fit(self, X, y): self : RadiusNeighborsRegressor The fitted radius neighbors regressor. """ - self._validate_params() return self._fit(X, y) def predict(self, X): diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 53e69495b9ed4..05607f0bd0c71 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -1,4 +1,5 @@ """Unsupervised nearest neighbors learner""" +from ..base import _fit_context from ._base import NeighborsBase from ._base import KNeighborsMixin from ._base import RadiusNeighborsMixin @@ -155,6 +156,10 @@ def __init__( n_jobs=n_jobs, ) + @_fit_context( + # NearestNeighbors.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the nearest neighbors estimator from the training dataset. @@ -172,5 +177,4 @@ def fit(self, X, y=None): self : NearestNeighbors The fitted nearest neighbors estimator. 
""" - self._validate_params() return self._fit(X) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 5c4bc5a39aa2d..fb8eab2f1776d 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -21,6 +21,7 @@ RegressorMixin, ) from ..base import is_classifier +from ..base import _fit_context from ._base import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS from ._stochastic_optimizers import SGDOptimizer, AdamOptimizer from ..metrics import accuracy_score, r2_score @@ -727,6 +728,7 @@ def _update_no_improvement_count(self, early_stopping, X_val, y_val): if self.loss_curve_[-1] < self.best_loss_: self.best_loss_ = self.loss_curve_[-1] + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model to data matrix X and target(s) y. @@ -744,8 +746,6 @@ def fit(self, X, y): self : object Returns a trained MLP model. """ - self._validate_params() - return self._fit(X, y, incremental=False) def _check_solver(self): @@ -1170,6 +1170,7 @@ def _score(self, X, y): return accuracy_score(y, self._predict(X, check_input=False)) @available_if(lambda est: est._check_solver()) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None): """Update the model with a single iteration over the given data. @@ -1194,9 +1195,6 @@ def partial_fit(self, X, y, classes=None): self : object Trained MLP model. """ - if not hasattr(self, "coefs_"): - self._validate_params() - if _check_partial_fit_first_call(self, classes): self._label_binarizer = LabelBinarizer() if type_of_target(y).startswith("multilabel"): @@ -1624,6 +1622,7 @@ def _validate_input(self, X, y, incremental, reset): return X, y @available_if(lambda est: est._check_solver) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y): """Update the model with a single iteration over the given data. @@ -1640,7 +1639,4 @@ def partial_fit(self, X, y): self : object Trained MLP model. """ - if not hasattr(self, "coefs_"): - self._validate_params() - return self._fit(X, y, incremental=True) diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 0624145116180..2ded6533d8d96 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -17,6 +17,7 @@ from ..base import BaseEstimator from ..base import TransformerMixin from ..base import ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils import gen_even_slices from ..utils.extmath import safe_sparse_dot @@ -269,6 +270,7 @@ def gibbs(self, v): return v_ + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Fit the model to the partial segment of the data X. @@ -285,9 +287,6 @@ def partial_fit(self, X, y=None): self : BernoulliRBM The fitted model. """ - - self._validate_params() - first_pass = not hasattr(self, "components_") X = self._validate_data( X, accept_sparse="csr", dtype=np.float64, reset=first_pass @@ -380,6 +379,7 @@ def score_samples(self, X): fe_ = self._free_energy(v_) return v.shape[1] * log_logistic(fe_ - fe) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model to the data X. @@ -396,9 +396,6 @@ def fit(self, X, y=None): self : BernoulliRBM The fitted model. 
""" - - self._validate_params() - X = self._validate_data(X, accept_sparse="csr", dtype=(np.float64, np.float32)) n_samples = X.shape[0] rng = check_random_state(self.random_state) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 8c5dc3bd82917..43b6b7eb0c939 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -16,6 +16,7 @@ from scipy import sparse from .base import clone, TransformerMixin +from .base import _fit_context from .preprocessing import FunctionTransformer from .utils._estimator_html_repr import _VisualBlock from .utils.metaestimators import available_if @@ -385,6 +386,10 @@ def _fit(self, X, y=None, **fit_params_steps): self.steps[step_idx] = (name, fitted_transformer) return X + @_fit_context( + # estimators in Pipeline.steps are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, **fit_params): """Fit the model. @@ -411,7 +416,6 @@ def fit(self, X, y=None, **fit_params): self : object Pipeline with fitted steps. """ - self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): @@ -429,6 +433,10 @@ def _can_fit_transform(self): ) @available_if(_can_fit_transform) + @_fit_context( + # estimators in Pipeline.steps are not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None, **fit_params): """Fit the model and transform with the final estimator. @@ -456,7 +464,6 @@ def fit_transform(self, X, y=None, **fit_params): Xt : ndarray of shape (n_samples, n_transformed_features) Transformed samples. """ - self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) @@ -505,6 +512,10 @@ def predict(self, X, **predict_params): return self.steps[-1][1].predict(Xt, **predict_params) @available_if(_final_estimator_has("fit_predict")) + @_fit_context( + # estimators in Pipeline.steps are not validated yet + prefer_skip_nested_validation=False + ) def fit_predict(self, X, y=None, **fit_params): """Transform the data, and apply `fit_predict` with the final estimator. @@ -533,7 +544,6 @@ def fit_predict(self, X, y=None, **fit_params): y_pred : ndarray Result of calling `fit_predict` on the final estimator. """ - self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 013f1f57e9373..139022a9897e6 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -22,6 +22,7 @@ TransformerMixin, OneToOneFeatureMixin, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..utils import check_array from ..utils._param_validation import Interval, Options, StrOptions, validate_params @@ -435,6 +436,7 @@ def fit(self, X, y=None): self._reset() return self.partial_fit(X, y) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Online computation of min and max on X for later scaling. @@ -456,8 +458,6 @@ def partial_fit(self, X, y=None): self : object Fitted scaler. 
""" - self._validate_params() - feature_range = self.feature_range if feature_range[0] >= feature_range[1]: raise ValueError( @@ -838,6 +838,7 @@ def fit(self, X, y=None, sample_weight=None): self._reset() return self.partial_fit(X, y, sample_weight) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, sample_weight=None): """Online computation of mean and std on X for later scaling. @@ -870,8 +871,6 @@ def partial_fit(self, X, y=None, sample_weight=None): self : object Fitted scaler. """ - self._validate_params() - first_call = not hasattr(self, "n_samples_seen_") X = self._validate_data( X, @@ -1183,6 +1182,7 @@ def fit(self, X, y=None): self._reset() return self.partial_fit(X, y) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Online computation of max absolute value of X for later scaling. @@ -1204,8 +1204,6 @@ def partial_fit(self, X, y=None): self : object Fitted scaler. """ - self._validate_params() - first_pass = not hasattr(self, "n_samples_seen_") X = self._validate_data( X, @@ -1514,6 +1512,7 @@ def __init__( self.unit_variance = unit_variance self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Compute the median and quantiles to be used for scaling. @@ -1531,8 +1530,6 @@ def fit(self, X, y=None): self : object Fitted scaler. """ - self._validate_params() - # at fit, convert sparse matrices to csc for optimized computation of # the quantiles X = self._validate_data( @@ -1972,6 +1969,7 @@ def __init__(self, norm="l2", *, copy=True): self.norm = norm self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -1991,7 +1989,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() self._validate_data(X, accept_sparse="csr") return self @@ -2155,6 +2152,7 @@ def __init__(self, *, threshold=0.0, copy=True): self.threshold = threshold self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -2174,7 +2172,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() self._validate_data(X, accept_sparse="csr") return self @@ -2634,6 +2631,7 @@ def _sparse_fit(self, X, random_state): # https://github.com/numpy/numpy/issues/14685 self.quantiles_ = np.maximum.accumulate(self.quantiles_) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Compute the quantiles used for transforming. @@ -2653,8 +2651,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() - if self.n_quantiles > self.subsample: raise ValueError( "The number of quantiles cannot be greater than" @@ -3101,6 +3097,7 @@ def __init__(self, method="yeo-johnson", *, standardize=True, copy=True): self.standardize = standardize self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Estimate the optimal parameter lambda for each feature. @@ -3120,10 +3117,10 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() self._fit(X, y=y, force_transform=False) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit `PowerTransformer` to `X`, then transform `X`. @@ -3141,7 +3138,6 @@ def fit_transform(self, X, y=None): X_new : ndarray of shape (n_samples, n_features) Transformed data. 
""" - self._validate_params() return self._fit(X, y, force_transform=True) def _fit(self, X, y=None, force_transform=False): @@ -3150,24 +3146,37 @@ def _fit(self, X, y=None, force_transform=False): if not self.copy and not force_transform: # if call from fit() X = X.copy() # force copy so that fit does not change X inplace + n_samples = X.shape[0] + mean = np.mean(X, axis=0, dtype=np.float64) + var = np.var(X, axis=0, dtype=np.float64) + optim_function = { "box-cox": self._box_cox_optimize, "yeo-johnson": self._yeo_johnson_optimize, }[self.method] + + transform_function = { + "box-cox": boxcox, + "yeo-johnson": self._yeo_johnson_transform, + }[self.method] + with np.errstate(invalid="ignore"): # hide NaN warnings - self.lambdas_ = np.array([optim_function(col) for col in X.T]) + self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype) + for i, col in enumerate(X.T): + # For yeo-johnson, leave constant features unchanged + # lambda=1 corresponds to the identity transformation + is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples) + if self.method == "yeo-johnson" and is_constant_feature: + self.lambdas_[i] = 1.0 + continue + + self.lambdas_[i] = optim_function(col) - if self.standardize or force_transform: - transform_function = { - "box-cox": boxcox, - "yeo-johnson": self._yeo_johnson_transform, - }[self.method] - for i, lmbda in enumerate(self.lambdas_): - with np.errstate(invalid="ignore"): # hide NaN warnings - X[:, i] = transform_function(X[:, i], lmbda) + if self.standardize or force_transform: + X[:, i] = transform_function(X[:, i], self.lambdas_[i]) if self.standardize: - self._scaler = StandardScaler(copy=False) + self._scaler = StandardScaler(copy=False).set_output(transform="default") if force_transform: X = self._scaler.fit_transform(X) else: diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 220950586a6ef..ac7432027f462 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -11,6 +11,7 @@ from . import OneHotEncoder from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils._param_validation import Hidden, Interval, StrOptions, Options from ..utils.validation import check_array from ..utils.validation import check_is_fitted @@ -192,6 +193,7 @@ def __init__( self.subsample = subsample self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """ Fit the estimator. @@ -216,7 +218,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X, dtype="numeric") if self.dtype in (np.float64, np.float32): diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 1fc4b16a52467..de3f983d7ae6f 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -10,6 +10,7 @@ from scipy import sparse from ..base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin +from ..base import _fit_context from ..utils import check_array, is_scalar_nan, _safe_indexing from ..utils.validation import check_is_fitted from ..utils.validation import _check_feature_names_in @@ -953,6 +954,7 @@ def _compute_n_features_outs(self): return output + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Fit OneHotEncoder to X. @@ -971,8 +973,6 @@ def fit(self, X, y=None): self Fitted encoder. 
""" - self._validate_params() - if self.sparse != "deprecated": warnings.warn( ( @@ -1446,6 +1446,7 @@ def __init__( self.min_frequency = min_frequency self.max_categories = max_categories + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Fit the OrdinalEncoder to X. @@ -1464,8 +1465,6 @@ def fit(self, X, y=None): self : object Fitted encoder. """ - self._validate_params() - if self.handle_unknown == "use_encoded_value": if is_scalar_nan(self.unknown_value): if np.dtype(self.dtype).kind != "f": diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index c250c5cd0226e..d7bf1810e61c0 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -3,6 +3,7 @@ import numpy as np from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils.metaestimators import available_if from ..utils.validation import ( _allclose_dense_sparse, @@ -197,6 +198,7 @@ def _check_inverse_transform(self, X): UserWarning, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit transformer by checking X. @@ -216,7 +218,6 @@ def fit(self, X, y=None): self : object FunctionTransformer class instance. """ - self._validate_params() X = self._check_input(X, reset=True) if self.check_inverse and not (self.func is None or self.inverse_func is None): self._check_inverse_transform(X) diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index ca8607b06c2e2..f656329607ee3 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -16,7 +16,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin - +from ..base import _fit_context from ..utils.sparsefuncs import min_max_axis from ..utils._param_validation import Interval, validate_params from ..utils import column_or_1d @@ -268,6 +268,7 @@ def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): self.pos_label = pos_label self.sparse_output = sparse_output + @_fit_context(prefer_skip_nested_validation=True) def fit(self, y): """Fit label binarizer. @@ -282,9 +283,6 @@ def fit(self, y): self : object Returns the instance itself. """ - - self._validate_params() - if self.neg_label >= self.pos_label: raise ValueError( f"neg_label={self.neg_label} must be strictly less than " @@ -761,6 +759,7 @@ def __init__(self, *, classes=None, sparse_output=False): self.classes = classes self.sparse_output = sparse_output + @_fit_context(prefer_skip_nested_validation=True) def fit(self, y): """Fit the label sets binarizer, storing :term:`classes_`. @@ -776,7 +775,6 @@ def fit(self, y): self : object Fitted estimator. """ - self._validate_params() self._cached_dict = None if self.classes is None: @@ -794,6 +792,7 @@ def fit(self, y): self.classes_[:] = classes return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, y): """Fit the label sets binarizer and transform the given label sets. 
@@ -814,7 +813,6 @@ def fit_transform(self, y): if self.classes is not None: return self.fit(y).transform(y) - self._validate_params() self._cached_dict = None # Automatically increment on new class diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 08ccf6355fc4e..1dfee8a088114 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -12,6 +12,7 @@ from scipy.special import comb from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils import check_array from ..utils.fixes import sp_version, parse_version from ..utils.validation import check_is_fitted, FLOAT_DTYPES, _check_sample_weight @@ -299,6 +300,7 @@ def get_feature_names_out(self, input_features=None): feature_names.append(name) return np.asarray(feature_names, dtype=object) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Compute number of output features. @@ -316,7 +318,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() _, n_features = self._validate_data(X, accept_sparse=True).shape if isinstance(self.degree, Integral): @@ -802,6 +803,7 @@ def get_feature_names_out(self, input_features=None): feature_names.append(f"{input_features[i]}_sp_{j}") return np.asarray(feature_names, dtype=object) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute knot positions of splines. @@ -823,8 +825,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted transformer. """ - self._validate_params() - X = self._validate_data( X, reset=True, diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py index 9100d72194a32..9dd33ddfa3cce 100644 --- a/sklearn/preprocessing/_target_encoder.py +++ b/sklearn/preprocessing/_target_encoder.py @@ -4,6 +4,7 @@ from ._encoders import _BaseEncoder from ..base import OneToOneFeatureMixin +from ..base import _fit_context from ._target_encoder_fast import _fit_encoding_fast from ._target_encoder_fast import _fit_encoding_fast_auto_smooth from ..utils.validation import _check_y, check_consistent_length @@ -176,6 +177,7 @@ def __init__( self.shuffle = shuffle self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the :class:`TargetEncoder` to X and y. @@ -192,10 +194,10 @@ def fit(self, X, y): self : object Fitted encoder. """ - self._validate_params() self._fit_encodings_all(X, y) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y): """Fit :class:`TargetEncoder` and transform X with the target encoding. 
@@ -219,7 +221,6 @@ def fit_transform(self, X, y): """ from ..model_selection import KFold, StratifiedKFold # avoid circular import - self._validate_params() X_ordinal, X_known_mask, y, n_categories = self._fit_encodings_all(X, y) # The cv splitter is voluntarily restricted to *KFold to enforce non diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 2e6fd810fedac..c00de906a7dbb 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2669,3 +2669,22 @@ def test_kernel_centerer_feature_names_out(): names_out = centerer.get_feature_names_out() samples_out2 = X_pairwise.shape[1] assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)]) + + +@pytest.mark.parametrize("standardize", [True, False]) +def test_power_transformer_constant_feature(standardize): + """Check that PowerTransfomer leaves constant features unchanged.""" + X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]] + + pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X) + + assert_allclose(pt.lambdas_, [1, 1, 1]) + + Xft = pt.fit_transform(X) + Xt = pt.transform(X) + + for Xt_ in [Xft, Xt]: + if standardize: + assert_allclose(Xt_, np.zeros_like(X)) + else: + assert_allclose(Xt_, X) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 9e9620e089521..ca0ee41784ab5 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -36,7 +36,7 @@ from .base import BaseEstimator, TransformerMixin from .base import ClassNamePrefixFeaturesOutMixin - +from .base import _fit_context from .utils import check_random_state from .utils._param_validation import Interval, StrOptions, validate_params from .utils.extmath import safe_sparse_dot @@ -356,6 +356,7 @@ def _compute_inverse_components(self): components = components.toarray() return linalg.pinv(components, check_finite=False) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Generate a sparse random projection matrix. @@ -374,7 +375,6 @@ def fit(self, X, y=None): self : object BaseRandomProjection class instance. """ - self._validate_params() X = self._validate_data( X, accept_sparse=["csr", "csc"], dtype=[np.float64, np.float32] ) diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 95fad0713d558..9d7786bc1d67e 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -64,6 +64,7 @@ from scipy.sparse import csgraph from ..base import BaseEstimator, ClassifierMixin +from ..base import _fit_context from ..metrics.pairwise import rbf_kernel from ..neighbors import NearestNeighbors from ..utils.extmath import safe_sparse_dot @@ -230,6 +231,7 @@ class labels. probabilities /= normalizer return probabilities + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit a semi-supervised label propagation model to X. @@ -254,7 +256,6 @@ def fit(self, X, y): self : object Returns the instance itself. 
""" - self._validate_params() X, y = self._validate_data( X, y, diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index 2438658ed89c8..c4706df1754da 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -4,6 +4,7 @@ import numpy as np from ..base import MetaEstimatorMixin, clone, BaseEstimator +from ..base import _fit_context from ..utils._param_validation import HasMethods, Interval, StrOptions from ..utils.validation import check_is_fitted from ..utils.metaestimators import available_if @@ -171,6 +172,10 @@ def __init__( self.max_iter = max_iter self.verbose = verbose + @_fit_context( + # SelfTrainingClassifier.base_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """ Fit self-training classifier using `X`, `y` as training data. @@ -189,8 +194,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() - # we need row slicing support for sparce matrices, but costly finiteness check # can be delegated to the base estimator. X, y = self._validate_data( diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 55919099e027c..a54c31cecb6e1 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -11,6 +11,7 @@ from . import _liblinear as liblinear # type: ignore from . import _libsvm_sparse as libsvm_sparse # type: ignore from ..base import BaseEstimator, ClassifierMixin +from ..base import _fit_context from ..preprocessing import LabelEncoder from ..utils.multiclass import _ovr_decision_function from ..utils import check_array, check_random_state @@ -143,6 +144,7 @@ def _more_tags(self): # Used by cross_val_score. return {"pairwise": self.kernel == "precomputed"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the SVM model according to the given training data. @@ -176,8 +178,6 @@ def fit(self, X, y, sample_weight=None): If X is a dense array, then the other methods will not support sparse matrices as input. """ - self._validate_params() - rnd = check_random_state(self.random_state) sparse = sp.isspmatrix(X) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index e035e74a05e2c..a438d007da970 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -5,6 +5,7 @@ from ._base import _fit_liblinear, _get_liblinear_solver_type, BaseSVC, BaseLibSVM from ..base import BaseEstimator, RegressorMixin, OutlierMixin +from ..base import _fit_context from ..linear_model._base import LinearClassifierMixin, SparseCoefMixin, LinearModel from ..utils import deprecated from ..utils.validation import _num_samples @@ -272,6 +273,7 @@ def __init__( self.penalty = penalty self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -296,8 +298,6 @@ def fit(self, X, y, sample_weight=None): self : object An instance of the estimator. """ - self._validate_params() - X, y = self._validate_data( X, y, @@ -529,6 +529,7 @@ def __init__( self.dual = dual self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -553,8 +554,6 @@ def fit(self, X, y, sample_weight=None): self : object An instance of the estimator. 
""" - self._validate_params() - X, y = self._validate_data( X, y, diff --git a/sklearn/tests/test_metadata_routing.py b/sklearn/tests/test_metadata_routing.py index 3b00b5a244ee8..a6e74c12f6e45 100644 --- a/sklearn/tests/test_metadata_routing.py +++ b/sklearn/tests/test_metadata_routing.py @@ -653,6 +653,21 @@ def fit(self, X, y, prop=None, **kwargs): Klass().fit(None, None) # for coverage +def test_removing_non_existing_param_raises(): + """Test that removing a metadata using UNUSED which doesn't exist raises.""" + + class InvalidRequestRemoval(BaseEstimator): + # `fit` (in this class or a parent) requests `prop`, but we don't want + # it requested at all. + __metadata_request__fit = {"prop": metadata_routing.UNUSED} + + def fit(self, X, y, **kwargs): + return self + + with pytest.raises(ValueError, match="Trying to remove parameter"): + InvalidRequestRemoval().get_metadata_routing() + + def test_method_metadata_request(): mmr = MethodMetadataRequest(owner="test", method="fit") diff --git a/sklearn/tests/test_public_functions.py b/sklearn/tests/test_public_functions.py index 3157e344cbef3..99f7f22d92e3d 100644 --- a/sklearn/tests/test_public_functions.py +++ b/sklearn/tests/test_public_functions.py @@ -241,6 +241,7 @@ def _check_function_param_validation( "sklearn.metrics.pairwise.manhattan_distances", "sklearn.metrics.pairwise.nan_euclidean_distances", "sklearn.metrics.pairwise.paired_cosine_distances", + "sklearn.metrics.pairwise.paired_distances", "sklearn.metrics.pairwise.paired_euclidean_distances", "sklearn.metrics.pairwise.paired_manhattan_distances", "sklearn.metrics.pairwise.polynomial_kernel", diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 4fdd8f27cd652..64a444db0b228 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -31,6 +31,7 @@ from sklearn.base import RegressorMixin from sklearn.base import is_classifier from sklearn.base import MultiOutputMixin +from sklearn.base import _fit_context from sklearn.utils import Bunch from sklearn.utils import check_random_state from sklearn.utils.validation import _check_sample_weight @@ -120,6 +121,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], + "store_leaf_values": [bool], } @abstractmethod @@ -138,6 +140,7 @@ def __init__( min_impurity_decrease, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): self.criterion = criterion self.splitter = splitter @@ -151,6 +154,7 @@ def __init__( self.min_impurity_decrease = min_impurity_decrease self.class_weight = class_weight self.ccp_alpha = ccp_alpha + self.store_leaf_values = store_leaf_values def get_depth(self): """Return the depth of the decision tree. @@ -180,7 +184,7 @@ def get_n_leaves(self): def _support_missing_values(self, X): return not issparse(X) and self._get_tags()["allow_nan"] - def _compute_feature_has_missing(self, X): + def _compute_missing_values_in_feature_mask(self, X): """Return boolean mask denoting if there are missing values for each feature. This method also ensures that X is finite. @@ -192,7 +196,7 @@ def _compute_feature_has_missing(self, X): Returns ------- - feature_has_missing : ndarray of shape (n_features,), or None + missing_values_in_feature_mask : ndarray of shape (n_features,), or None Missing value mask. 
If missing values are not supported or there are no missing values, return None. """ @@ -213,13 +217,17 @@ def _compute_feature_has_missing(self, X): if not np.isnan(overall_sum): return None - feature_has_missing = _any_isnan_axis0(X) - return feature_has_missing + missing_values_in_feature_mask = _any_isnan_axis0(X) + return missing_values_in_feature_mask def _fit( - self, X, y, sample_weight=None, check_input=True, feature_has_missing=None + self, + X, + y, + sample_weight=None, + check_input=True, + missing_values_in_feature_mask=None, ): - self._validate_params() random_state = check_random_state(self.random_state) if check_input: @@ -227,7 +235,7 @@ def _fit( # We can't pass multi_output=True because that would allow y to be # csr. - # _compute_feature_has_missing will check for finite values and + # _compute_missing_values_in_feature_mask will check for finite values and # compute the missing mask if the tree supports missing values check_X_params = dict( dtype=DTYPE, accept_sparse="csc", force_all_finite=False @@ -240,7 +248,9 @@ def _fit( else: X = self._validate_data(X, **check_X_params) - feature_has_missing = self._compute_feature_has_missing(X) + missing_values_in_feature_mask = ( + self._compute_missing_values_in_feature_mask(X) + ) if issparse(X): X.sort_indices() @@ -388,7 +398,7 @@ def _fit( X, y, sample_weight, - feature_has_missing, + missing_values_in_feature_mask, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -397,6 +407,9 @@ def _fit( random_state, ) + if self.store_leaf_values: + self.leaf_nodes_samples_ = self.tree_.leaf_nodes_samples + return self def _build_tree( @@ -404,7 +417,7 @@ def _build_tree( X, y, sample_weight, - feature_has_missing, + missing_values_in_feature_mask, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -483,6 +496,7 @@ def _build_tree( min_weight_leaf, max_depth, self.min_impurity_decrease, + self.store_leaf_values, ) else: builder = BestFirstTreeBuilder( @@ -493,9 +507,10 @@ def _build_tree( max_depth, max_leaf_nodes, self.min_impurity_decrease, + self.store_leaf_values, ) - builder.build(self.tree_, X, y, sample_weight, feature_has_missing) + builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): self.n_classes_ = self.n_classes_[0] @@ -551,6 +566,9 @@ def predict(self, X, check_input=True): """ check_is_fitted(self) X = self._validate_X_predict(X, check_input) + + # proba is a count matrix of leaves that fall into + # (n_samples, n_outputs, max_n_classes) array proba = self.tree_.predict(X) n_samples = X.shape[0] @@ -577,6 +595,128 @@ def predict(self, X, check_input=True): else: return proba[:, :, 0] + def get_leaf_node_samples(self, X, check_input=True): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_nodes_samples : a list of array-like of shape + (n_leaf_node_samples, n_outputs) + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. 
+ """ + if not self.store_leaf_values: + raise RuntimeError( + "leaf node samples are not stored when store_leaf_values=False" + ) + + # get indices of leaves per sample (n_samples,) + X_leaves = self.apply(X, check_input=check_input) + n_samples = X_leaves.shape[0] + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + leaf_nodes_samples = [] + for idx in range(n_samples): + leaf_id = X_leaves[idx] + leaf_nodes_samples.append(leaf_samples[leaf_id]) + return leaf_nodes_samples + + def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float or array-like of float, default=0.5 + The quantile(s) at which to evaluate, by default 0.5 (the median). + method : str, default="nearest" + The interpolation method to use. Can be any interpolation keyword + argument accepted by :func:`np.quantile`. + check_input : bool, default=True + Whether or not to check the input. + + Returns + ------- + predictions : array-like of shape (n_samples, len(quantiles), n_outputs) + The predicted quantiles; the trailing output axis is dropped when + the tree has a single output. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Predicting quantiles requires that the tree stores leaf node samples." + ) + + check_is_fitted(self) + + # Check data + X = self._validate_X_predict(X, check_input) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # get indices of leaves per sample + X_leaves = self.apply(X) + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + # compute quantiles (n_samples, n_quantiles, n_outputs) + n_samples = X.shape[0] + n_quantiles = len(quantiles) + proba = np.zeros((n_samples, n_quantiles, self.n_outputs_)) + for idx, leaf_id in enumerate(X_leaves): + # predict by taking the quantile across the samples in the leaf for + # each output + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, interpolation=method + ) + + # Classification + if is_classifier(self): + if self.n_outputs_ == 1: + # return the class with the highest probability for each quantile + # (n_samples, n_quantiles) + class_preds = np.zeros( + (n_samples, n_quantiles), dtype=self.classes_.dtype + ) + for i in range(n_quantiles): + class_pred_per_sample = ( + proba[:, i, :].squeeze().astype(self.classes_.dtype) + ) + class_preds[:, i] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + return class_preds + else: + class_type = self.classes_[0].dtype + predictions = np.zeros( + (n_samples, n_quantiles, self.n_outputs_), dtype=class_type + ) + for k in range(self.n_outputs_): + for i in range(n_quantiles): + class_pred_per_sample = proba[:, i, k].squeeze().astype(int) + predictions[:, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + + return predictions + # Regression + else: + if self.n_outputs_ == 1: + return proba[:, :, 0] + + else: + return proba + def apply(self, X, check_input=True): """Return the index of the leaf that each sample is predicted as. @@ -851,6 +991,16 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the training target values of the samples that fall into + each leaf in the ``tree_`` attribute. + Each leaf stores a 2D array of these values, keyed by the leaf's node id.
+ + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -896,6 +1046,9 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + leaf_nodes_samples_ : dict + A dictionary of leaf node index and the y_train samples in that leaf. + See Also -------- DecisionTreeRegressor : A decision tree regressor. @@ -965,6 +1118,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -979,8 +1133,10 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree classifier from the training set (X, y). @@ -1327,6 +1483,7 @@ def __init__( max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1340,8 +1497,10 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree regressor from the training set (X, y). @@ -1653,6 +1812,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1667,6 +1827,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) @@ -1880,6 +2041,7 @@ def __init__( min_impurity_decrease=0.0, max_leaf_nodes=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1893,4 +2055,5 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 721b475f40436..31c10ccfe4f93 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -92,7 +92,7 @@ cdef class Criterion(BaseCriterion): cdef void node_samples( self, - vector[vector[DOUBLE_t]]* dest + vector[vector[DOUBLE_t]]& dest ) noexcept nogil cdef class ClassificationCriterion(Criterion): diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index c3f08ec859bee..dfa64c1184df5 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -258,9 +258,17 @@ cdef class Criterion(BaseCriterion): cdef void node_samples( self, - vector[vector[DOUBLE_t]]* dest + vector[vector[DOUBLE_t]]& dest ) noexcept nogil: - cdef SIZE_t i, j + """Copy the samples of the current node into dest. + + Parameters + ---------- + dest : reference vector[vector[DOUBLE_t]] + The vector of vectors where the samples should be copied. + This is passed by reference and modified in place. 
+ """ + cdef SIZE_t i, j, k # Resize the destination vector of vectors dest.resize(self.n_node_samples) @@ -272,7 +280,8 @@ cdef class Criterion(BaseCriterion): # Get the sample values for each output for k in range(self.n_outputs): - dest[i][k].push_back(self.y[j, k]) + dest[i].push_back(self.y[j, k]) + cdef inline void _move_sums_classification( ClassificationCriterion criterion, diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index fb21f676e66cc..915b2baa30e94 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -104,10 +104,10 @@ cdef class Splitter(BaseSplitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1 - cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil # Methods that allow modifications to stopping conditions cdef bint check_presplit_conditions( diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 7f21d5da545fb..1f3d164370b95 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -168,7 +168,7 @@ cdef class Splitter(BaseSplitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: """Initialize the splitter. @@ -245,7 +245,7 @@ cdef class Splitter(BaseSplitter): self.end ) - if feature_has_missing is not None: + if missing_values_in_feature_mask is not None: self.criterion.init_sum_missing() return 0 @@ -280,7 +280,7 @@ cdef class Splitter(BaseSplitter): self.criterion.node_value(dest) - cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil: + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil: """Copy the samples[start:end] into dest.""" self.criterion.node_samples(dest) @@ -903,19 +903,19 @@ cdef class DensePartitioner: cdef SIZE_t start cdef SIZE_t end cdef SIZE_t n_missing - cdef const unsigned char[::1] feature_has_missing + cdef const unsigned char[::1] missing_values_in_feature_mask def __init__( self, const DTYPE_t[:, :] X, SIZE_t[::1] samples, DTYPE_t[::1] feature_values, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ): self.X = X self.samples = samples self.feature_values = feature_values - self.feature_has_missing = feature_has_missing + self.missing_values_in_feature_mask = missing_values_in_feature_mask cdef inline void init_node_split(self, SIZE_t start, SIZE_t end) noexcept nogil: """Initialize splitter at the beginning of node_split.""" @@ -938,13 +938,13 @@ cdef class DensePartitioner: const DTYPE_t[:, :] X = self.X SIZE_t[::1] samples = self.samples SIZE_t n_missing = 0 - const unsigned char[::1] feature_has_missing = self.feature_has_missing + const unsigned char[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask # Sort samples along that feature; by # copying the values into an array and # sorting the array in a manner which utilizes the cache more # effectively. - if feature_has_missing is not None and feature_has_missing[current_feature]: + if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]: i, current_end = self.start, self.end - 1 # Missing values are placed at the end and do not participate in the sorting. 
while i <= current_end: @@ -1113,7 +1113,7 @@ cdef class SparsePartitioner: cdef SIZE_t start cdef SIZE_t end cdef SIZE_t n_missing - cdef const unsigned char[::1] feature_has_missing + cdef const unsigned char[::1] missing_values_in_feature_mask cdef const DTYPE_t[::1] X_data cdef const INT32_t[::1] X_indices @@ -1134,7 +1134,7 @@ cdef class SparsePartitioner: SIZE_t[::1] samples, SIZE_t n_samples, DTYPE_t[::1] feature_values, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ): if not isspmatrix_csc(X): raise ValueError("X should be in csc format") @@ -1158,7 +1158,7 @@ cdef class SparsePartitioner: for p in range(n_samples): self.index_to_samples[samples[p]] = p - self.feature_has_missing = feature_has_missing + self.missing_values_in_feature_mask = missing_values_in_feature_mask cdef inline void init_node_split(self, SIZE_t start, SIZE_t end) noexcept nogil: """Initialize splitter at the beginning of node_split.""" @@ -1529,11 +1529,11 @@ cdef class BestSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = DensePartitioner( - X, self.samples, self.feature_values, feature_has_missing + X, self.samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, @@ -1555,11 +1555,11 @@ cdef class BestSparseSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = SparsePartitioner( - X, self.samples, self.n_samples, self.feature_values, feature_has_missing + X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, @@ -1581,11 +1581,11 @@ cdef class RandomSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = DensePartitioner( - X, self.samples, self.feature_values, feature_has_missing + X, self.samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, @@ -1607,11 +1607,11 @@ cdef class RandomSparseSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = SparsePartitioner( - X, self.samples, self.n_samples, self.feature_values, feature_has_missing + X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, 
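(Aside for reviewers; illustration only, not part of the diff.) The splitter and criterion hunks above, together with the ``_tree.pxd`` / ``_tree.pyx`` hunks below, back the new Python-level API added in ``sklearn/tree/_classes.py``: ``store_leaf_values=True`` makes a fitted tree keep the training target values of the samples that reach each leaf (``tree_.leaf_nodes_samples``, a dict keyed by leaf node id), and ``get_leaf_node_samples`` / ``predict_quantiles`` read from that store. A rough usage sketch, assuming this fork is installed (upstream scikit-learn does not expose these methods); the toy data is made up:

# Sketch of the leaf-storage / quantile-prediction API added by this patch.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]])
y = np.array([0.0, 0.5, 1.0, 9.0, 9.5, 10.0])

# Keep the y values of the training samples that end up in each leaf.
reg = DecisionTreeRegressor(max_depth=1, store_leaf_values=True, random_state=0)
reg.fit(X, y)

# One array per query sample: the stored y values of the leaf it falls into.
leaf_samples = reg.get_leaf_node_samples(X[:2])

# Per-sample quantiles over those stored values; for a single-output tree the
# result has shape (n_samples, n_quantiles).
q = reg.predict_quantiles(X[:2], quantiles=[0.1, 0.5, 0.9])

Calling either method on a tree fitted with the default ``store_leaf_values=False`` raises a ``RuntimeError``; the new tests ``test_leaf_node_samples`` and ``test_quantile_tree_predict`` further below exercise exactly this behaviour.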
diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 94714cc33400c..828c99a2f4ea1 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -49,13 +49,6 @@ cdef class BaseTree: cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample cdef double* value # Array of values prediction values for each node - # Enables the use of tree to store distributions of the output to allow - # arbitrary usage of the the leaves. This is used in the quantile - # estimators for example. - # for storing samples at each leaf node with leaf's node ID as the key and - # the sample values as the value - cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples - # Generic Methods: These are generic methods used by any tree. cdef int _resize(self, SIZE_t capacity) except -1 nogil cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil @@ -121,9 +114,18 @@ cdef class Tree(BaseTree): cdef public SIZE_t n_outputs # Number of outputs in y cdef public SIZE_t max_n_classes # max(n_classes) + # Enables the use of tree to store distributions of the output to allow + # arbitrary usage of the the leaves. This is used in the quantile + # estimators for example. + # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples + # Methods cdef cnp.ndarray _get_value_ndarray(self) cdef cnp.ndarray _get_node_ndarray(self) + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id) + cdef cnp.ndarray _get_value_samples_keys(self) cpdef cnp.ndarray predict(self, object X) @@ -146,7 +148,7 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping - cdef unsigned char store_leaf_values # Whether to store leaf values + cdef unsigned char store_leaf_values # Whether to store leaf values cpdef build( self, @@ -154,7 +156,7 @@ cdef class TreeBuilder: object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=*, - const unsigned char[::1] feature_has_missing=*, + const unsigned char[::1] missing_values_in_feature_mask=*, ) cdef _check_input( diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 8ca98a64b42ab..1565ab441969d 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -100,7 +100,7 @@ cdef class TreeBuilder: object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=None, - const unsigned char[::1] feature_has_missing=None, + const unsigned char[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" pass @@ -182,7 +182,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=None, - const unsigned char[::1] feature_has_missing=None, + const unsigned char[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" @@ -208,7 +208,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef double min_impurity_decrease = self.min_impurity_decrease # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, feature_has_missing) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask) cdef SIZE_t start cdef SIZE_t end @@ -229,8 +229,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t max_depth_seen = -1 cdef int rc = 0 - cdef int node_idx - cdef stack[StackRecord] builder_stack cdef StackRecord stack_record @@ -319,11 +317,8 @@ 
cdef class DepthFirstTreeBuilder(TreeBuilder): "impurity": split.impurity_left, "n_constant_features": n_constant_features}) elif self.store_leaf_values and is_leaf: - with gil: - print('Storing leaf values...') - # copy leaf values to leaf_values array - splitter.node_samples(&tree.value_samples[node_id]) + splitter.node_samples(tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -406,7 +401,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=None, - const unsigned char[::1] feature_has_missing=None, + const unsigned char[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" @@ -418,7 +413,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, feature_has_missing) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask) cdef vector[FrontierRecord] frontier cdef FrontierRecord record @@ -459,6 +454,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + if self.store_leaf_values: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[record.node_id]) else: # Node is expandable @@ -1321,6 +1319,14 @@ cdef class Tree(BaseTree): def value(self): return self._get_value_ndarray()[:self.node_count] + @property + def leaf_nodes_samples(self): + leaf_node_samples = dict() + keys = self._get_value_samples_keys() + for node_id in keys: + leaf_node_samples[node_id] = self._get_value_samples_ndarray(node_id) + return leaf_node_samples + # TODO: Convert n_classes to cython.integral memory view once # https://github.com/cython/cython/issues/5243 is fixed def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): @@ -1374,6 +1380,7 @@ cdef class Tree(BaseTree): d["node_count"] = self.node_count d["nodes"] = self._get_node_ndarray() d["values"] = self._get_value_ndarray() + d['value_samples'] = self.leaf_nodes_samples return d def __setstate__(self, d): @@ -1407,6 +1414,35 @@ cdef class Tree(BaseTree): memcpy(self.value, cnp.PyArray_DATA(value_ndarray), self.capacity * self.value_stride * sizeof(double)) + # store the leaf node samples if they exist + value_samples_dict = d['value_samples'] + for node_id, leaf_samples in value_samples_dict.items(): + self.value_samples[node_id].resize(leaf_samples.shape[0]) + for idx in range(leaf_samples.shape[0]): + for jdx in range(leaf_samples.shape[1]): + self.value_samples[node_id][idx].push_back(leaf_samples[idx, jdx]) + + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id): + """Wraps value_samples as a 2-d NumPy array per node_id.""" + cdef int i, j + cdef int n_samples = self.value_samples[node_id].size() + cdef cnp.ndarray[DOUBLE_t, ndim=2, mode='c'] leaf_node_samples = np.empty(shape=(n_samples, self.n_outputs), dtype=np.float64) + + for i in range(n_samples): + for j in range(self.n_outputs): + leaf_node_samples[i, j] = self.value_samples[node_id][i][j] + return leaf_node_samples + + cdef cnp.ndarray _get_value_samples_keys(self): + """Wraps value_samples keys as a 1-d NumPy array of keys.""" + cdef cnp.ndarray[SIZE_t, ndim=1, mode='c'] keys = np.empty(len(self.value_samples), dtype=np.intp) + cdef unsigned int i = 0 + + for key in self.value_samples: + keys[i] = key.first + i += 1 + return keys + cdef cnp.ndarray _get_value_ndarray(self): """Wraps value as 
a 3-d NumPy array. diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index eefae6cdaa3f6..44a19b3dc0520 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -890,7 +890,7 @@ def test_pickle(): else: X, y = diabetes.data, diabetes.target - est = TreeEstimator(random_state=0) + est = TreeEstimator(random_state=0, store_leaf_values=True) est.fit(X, y) score = est.score(X, y) @@ -909,6 +909,7 @@ def test_pickle(): "n_node_samples", "weighted_n_node_samples", "value", + "leaf_nodes_samples", ] fitted_attribute = { attribute: getattr(est.tree_, attribute) for attribute in attributes @@ -923,14 +924,25 @@ def test_pickle(): score == score2 ), "Failed to generate same score after pickling with {0}".format(name) for attribute in fitted_attribute: - assert_array_equal( - getattr(est2.tree_, attribute), - fitted_attribute[attribute], - err_msg=( - f"Failed to generate same attribute {attribute} after pickling with" - f" {name}" - ), - ) + if attribute == "leaf_nodes_samples": + for key in fitted_attribute[attribute].keys(): + assert_array_equal( + getattr(est2.tree_, attribute)[key], + fitted_attribute[attribute][key], + err_msg=( + f"Failed to generate same attribute {attribute} after" + f" pickling with {name}" + ), + ) + else: + assert_array_equal( + getattr(est2.tree_, attribute), + fitted_attribute[attribute], + err_msg=( + f"Failed to generate same attribute {attribute} after pickling" + f" with {name}" + ), + ) def test_multioutput(): @@ -2634,3 +2646,148 @@ def test_sample_weight_non_uniform(make_data, Tree): tree_samples_removed.fit(X[1::2, :], y[1::2]) assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X)) + + +@pytest.mark.parametrize( + "tree_name", + ALL_TREES, +) +def test_leaf_node_samples(tree_name): + """Test getting leaf node samples from fitted tree.""" + tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=False) + tree.fit(X_small, y_small) + + # Check that the leaf node samples are not stored by default + assert tree.tree_.leaf_nodes_samples == dict() + + # error should be raised if trying to predict quantiles + assert hasattr(tree, "predict_quantiles") + for meth in ["predict_quantiles", "get_leaf_node_samples"]: + if hasattr(tree, meth): + with pytest.raises( + RuntimeError, + match="leaf node samples", + ): + getattr(tree, meth)(X_small) + + quantile_tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=True) + quantile_tree.fit(X_small, y_small) + + score = tree.score(X_small, y_small) + new_score = quantile_tree.score(X_small, y_small) + assert np.isclose(score, new_score) + + # Check that the leaf node samples are what they should be + X_leaves = quantile_tree.apply(X_small) + for idx in range(X_leaves.shape[0]): + leaf_idx = X_leaves[idx] + assert y_small[idx] in quantile_tree.tree_.leaf_nodes_samples[leaf_idx] + assert set(np.unique(X_leaves)) == set( + quantile_tree.tree_.leaf_nodes_samples.keys() + ) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0) + + # fit on binary results in perfect leaves, so all quantiles are the same + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(X_small), pred[:, 0]) + assert_array_equal(est.predict(X_small), pred[:, 1]) + assert_array_equal(est.predict(X_small), pred[:, 2]) + 
assert_array_equal(pred[:, 0], y_small) + assert np.unique(pred, axis=1).shape[1] == 1 + + est.fit(X_small[:-5], y_small[:-5]) + held_out_X = X_small[-5:, :] + pred = est.predict_quantiles(held_out_X, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(held_out_X), pred[:, 0]) + assert_array_equal(est.predict(held_out_X), pred[:, 1]) + assert_array_equal(est.predict(held_out_X), pred[:, 2]) + + # fit on real data + est.fit(iris.data, iris.target) + pred = est.predict_quantiles(iris.data, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(pred[:, 0], iris.target) + assert_array_equal(pred[:, 1], iris.target) + assert_array_equal(pred[:, 2], iris.target) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict_impure_leaves(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0, max_depth=4) + # fit on binary results with constrained depth will result in impure leaves + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert np.unique(pred, axis=1).shape[1] > 1 + + +def test_multioutput_quantiles(): + # Check estimators on multi-output problems. + X = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + + y = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + + T = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + # toy classification problem + for name, TreeClassifier in CLF_TREES.items(): + clf = TreeClassifier(random_state=0, store_leaf_values=True) + clf.fit(X, y) + + y_hat = clf.predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + y_hat = y_hat.squeeze() + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) + + # toy regression problem + for name, TreeRegressor in REG_TREES.items(): + reg = TreeRegressor(random_state=0, store_leaf_values=True) + y_hat = reg.fit(X, y).predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) diff --git a/sklearn/utils/_metadata_requests.py b/sklearn/utils/_metadata_requests.py index 82b3eec69b461..a1cd934c13756 100644 --- a/sklearn/utils/_metadata_requests.py +++ b/sklearn/utils/_metadata_requests.py @@ -241,8 +241,14 @@ def add_request( if alias == param: alias = True - if alias == UNUSED and param in self._requests: - del self._requests[param] + if alias == UNUSED: + if param in self._requests: + del self._requests[param] + else: + raise ValueError( + f"Trying to remove parameter {param} with UNUSED which doesn't" + " exist." 
+ ) else: self._requests[param] = alias @@ -1155,7 +1161,7 @@ def _build_request_for_signature(cls, router, method): # ignore the first parameter of the method, which is usually "self" params = list(inspect.signature(getattr(cls, method)).parameters.items())[1:] for pname, param in params: - if pname in {"X", "y", "Y"}: + if pname in {"X", "y", "Y", "Xt", "yt"}: continue if param.kind in {param.VAR_POSITIONAL, param.VAR_KEYWORD}: continue diff --git a/sklearn/utils/_plotting.py b/sklearn/utils/_plotting.py index cc301b509e386..c0671046c9cd4 100644 --- a/sklearn/utils/_plotting.py +++ b/sklearn/utils/_plotting.py @@ -1,3 +1,5 @@ +import numpy as np + from . import check_consistent_length, check_matplotlib_support from .multiclass import type_of_target from .validation import _check_pos_label_consistency @@ -56,3 +58,41 @@ def _validate_from_predictions_params( name = name if name is not None else "Classifier" return pos_label, name + + +def _validate_score_name(score_name, scoring, negate_score): + """Validate the `score_name` parameter. + + If `score_name` is provided, we just return it as-is. + If `score_name` is `None`, we use `Score` if `negate_score` is `False` and + `Negative score` otherwise. + If `score_name` is a string or a callable, we infer the name. We replace `_` by + spaces and capitalize the first letter. We remove `neg_` and replace it by + `"Negative"` if `negate_score` is `False` or just remove it otherwise. + """ + if score_name is not None: + return score_name + elif scoring is None: + return "Negative score" if negate_score else "Score" + else: + score_name = scoring.__name__ if callable(scoring) else scoring + if negate_score: + if score_name.startswith("neg_"): + score_name = score_name[4:] + else: + score_name = f"Negative {score_name}" + elif score_name.startswith("neg_"): + score_name = f"Negative {score_name[4:]}" + score_name = score_name.replace("_", " ") + return score_name.capitalize() + + +def _interval_max_min_ratio(data): + """Compute the ratio between the largest and smallest inter-point distances. + + A value larger than 5 typically indicates that the parameter range would + better be displayed with a log scale while a linear scale would be more + suitable otherwise. + """ + diff = np.diff(np.sort(data)) + return diff.max() / diff.min() diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index cb1e0f2b1fa4d..7d8e673210ff7 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -4424,7 +4424,7 @@ def _output_from_fit_transform(transformer, name, X, df, y): return outputs -def _check_generated_dataframe(name, case, outputs_default, outputs_pandas): +def _check_generated_dataframe(name, case, index, outputs_default, outputs_pandas): import pandas as pd X_trans, feature_names_default = outputs_default @@ -4434,7 +4434,12 @@ def _check_generated_dataframe(name, case, outputs_default, outputs_pandas): # We always rely on the output of `get_feature_names_out` of the # transformer used to generate the dataframe as a ground-truth of the # columns. 
- expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas, copy=False) + # If a dataframe is passed into transform, then the output should have the same + # index + expected_index = index if case.endswith("df") else None + expected_dataframe = pd.DataFrame( + X_trans, columns=feature_names_pandas, copy=False, index=expected_index + ) try: pd.testing.assert_frame_equal(df_trans, expected_dataframe) @@ -4469,7 +4474,8 @@ def check_set_output_transform_pandas(name, transformer_orig): set_random_state(transformer) feature_names_in = [f"col{i}" for i in range(X.shape[1])] - df = pd.DataFrame(X, columns=feature_names_in, copy=False) + index = [f"index{i}" for i in range(X.shape[0])] + df = pd.DataFrame(X, columns=feature_names_in, copy=False, index=index) transformer_default = clone(transformer).set_output(transform="default") outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y) @@ -4483,7 +4489,7 @@ def check_set_output_transform_pandas(name, transformer_orig): for case in outputs_default: _check_generated_dataframe( - name, case, outputs_default[case], outputs_pandas[case] + name, case, index, outputs_default[case], outputs_pandas[case] ) @@ -4511,7 +4517,8 @@ def check_global_ouptut_transform_pandas(name, transformer_orig): set_random_state(transformer) feature_names_in = [f"col{i}" for i in range(X.shape[1])] - df = pd.DataFrame(X, columns=feature_names_in, copy=False) + index = [f"index{i}" for i in range(X.shape[0])] + df = pd.DataFrame(X, columns=feature_names_in, copy=False, index=index) transformer_default = clone(transformer).set_output(transform="default") outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y) @@ -4528,5 +4535,5 @@ def check_global_ouptut_transform_pandas(name, transformer_orig): for case in outputs_default: _check_generated_dataframe( - name, case, outputs_default[case], outputs_pandas[case] + name, case, index, outputs_default[case], outputs_pandas[case] ) diff --git a/sklearn/utils/tests/test_param_validation.py b/sklearn/utils/tests/test_param_validation.py index 528a667a3f58e..022f9f373a049 100644 --- a/sklearn/utils/tests/test_param_validation.py +++ b/sklearn/utils/tests/test_param_validation.py @@ -6,6 +6,7 @@ from sklearn._config import config_context, get_config from sklearn.base import BaseEstimator +from sklearn.base import _fit_context from sklearn.model_selection import LeaveOneOut from sklearn.utils import deprecated from sklearn.utils._param_validation import Hidden @@ -60,8 +61,9 @@ class _Estimator(BaseEstimator): def __init__(self, a): self.a = a + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X=None, y=None): - self._validate_params() + pass @pytest.mark.parametrize("interval_type", [Integral, Real]) diff --git a/sklearn/utils/tests/test_plotting.py b/sklearn/utils/tests/test_plotting.py new file mode 100644 index 0000000000000..00b1f7f74fcd0 --- /dev/null +++ b/sklearn/utils/tests/test_plotting.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest + +from sklearn.utils._plotting import _validate_score_name, _interval_max_min_ratio + + +def metric(): + pass # pragma: no cover + + +def neg_metric(): + pass # pragma: no cover + + +@pytest.mark.parametrize( + "score_name, scoring, negate_score, expected_score_name", + [ + ("accuracy", None, False, "accuracy"), # do not transform the name + (None, "accuracy", False, "Accuracy"), # capitalize the name + (None, "accuracy", True, "Negative accuracy"), # add "Negative" + (None, "neg_mean_absolute_error", False, 
"Negative mean absolute error"), + (None, "neg_mean_absolute_error", True, "Mean absolute error"), # remove "neg_" + ("MAE", "neg_mean_absolute_error", True, "MAE"), # keep score_name + (None, None, False, "Score"), # default name + (None, None, True, "Negative score"), # default name but negated + ("Some metric", metric, False, "Some metric"), # do not transform the name + ("Some metric", metric, True, "Some metric"), # do not transform the name + (None, metric, False, "Metric"), # default name + (None, metric, True, "Negative metric"), # default name but negated + ("Some metric", neg_metric, False, "Some metric"), # do not transform the name + ("Some metric", neg_metric, True, "Some metric"), # do not transform the name + (None, neg_metric, False, "Negative metric"), # default name + (None, neg_metric, True, "Metric"), # default name but negated + ], +) +def test_validate_score_name(score_name, scoring, negate_score, expected_score_name): + """Check that we return the right score name.""" + assert ( + _validate_score_name(score_name, scoring, negate_score) == expected_score_name + ) + + +# In the following test, we check the value of the max to min ratio +# for parameter value intervals to check that using a decision threshold +# of 5. is a good heuristic to decide between linear and log scales on +# common ranges of parameter values. +@pytest.mark.parametrize( + "data, lower_bound, upper_bound", + [ + # Such a range could be clearly displayed with either log scale or linear + # scale. + (np.geomspace(0.1, 1, 5), 5, 6), + # Checking that the ratio is still positive on a negative log scale. + (-np.geomspace(0.1, 1, 10), 7, 8), + # Evenly spaced parameter values lead to a ratio of 1. + (np.linspace(0, 1, 5), 0.9, 1.1), + # This is not exactly spaced on a log scale but we will benefit from treating + # it as such for visualization. 
+ ([1, 2, 5, 10, 20, 50], 20, 40), + ], +) +def test_inverval_max_min_ratio(data, lower_bound, upper_bound): + assert lower_bound < _interval_max_min_ratio(data) < upper_bound diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 4a765d1404794..2d39279f81745 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -42,6 +42,7 @@ from sklearn.utils import _safe_indexing from sklearn.utils.validation import ( has_fit_parameter, + _is_fitted, check_is_fitted, check_consistent_length, assert_all_finite, @@ -848,23 +849,32 @@ def fit(self, X, y): msg = "not fitted" est = MyEstimator() + assert not _is_fitted(est, attributes=["a_", "b_"]) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"]) + assert not _is_fitted(est, attributes=["a_", "b_"], all_or_any=all) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + assert not _is_fitted(est, attributes=["a_", "b_"], all_or_any=any) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) est.a_ = "a" + assert not _is_fitted(est, attributes=["a_", "b_"]) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"]) + assert not _is_fitted(est, attributes=["a_", "b_"], all_or_any=all) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + assert _is_fitted(est, attributes=["a_", "b_"], all_or_any=any) check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) est.b_ = "b" + assert _is_fitted(est, attributes=["a_", "b_"]) check_is_fitted(est, attributes=["a_", "b_"]) + assert _is_fitted(est, attributes=["a_", "b_"], all_or_any=all) check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + assert _is_fitted(est, attributes=["a_", "b_"], all_or_any=any) check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 6179d91c2a491..8ceef15986567 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1369,6 +1369,44 @@ def check_symmetric(array, *, tol=1e-10, raise_warning=True, raise_exception=Fal return array +def _is_fitted(estimator, attributes=None, all_or_any=all): + """Determine if an estimator is fitted + + Parameters + ---------- + estimator : estimator instance + Estimator instance for which the check is performed. + + attributes : str, list or tuple of str, default=None + Attribute name(s) given as string or a list/tuple of strings + Eg.: ``["coef_", "estimator_", ...], "coef_"`` + + If `None`, `estimator` is considered fitted if there exist an + attribute that ends with a underscore and does not start with double + underscore. + + all_or_any : callable, {all, any}, default=all + Specify whether all or any of the given attributes must exist. + + Returns + ------- + fitted : bool + Whether the estimator is fitted. 
+ """ + if attributes is not None: + if not isinstance(attributes, (list, tuple)): + attributes = [attributes] + return all_or_any([hasattr(estimator, attr) for attr in attributes]) + + if hasattr(estimator, "__sklearn_is_fitted__"): + return estimator.__sklearn_is_fitted__() + + fitted_attrs = [ + v for v in vars(estimator) if v.endswith("_") and not v.startswith("__") + ] + return len(fitted_attrs) > 0 + + def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. @@ -1425,18 +1463,7 @@ def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): if not hasattr(estimator, "fit"): raise TypeError("%s is not an estimator instance." % (estimator)) - if attributes is not None: - if not isinstance(attributes, (list, tuple)): - attributes = [attributes] - fitted = all_or_any([hasattr(estimator, attr) for attr in attributes]) - elif hasattr(estimator, "__sklearn_is_fitted__"): - fitted = estimator.__sklearn_is_fitted__() - else: - fitted = [ - v for v in vars(estimator) if v.endswith("_") and not v.startswith("__") - ] - - if not fitted: + if not _is_fitted(estimator, attributes, all_or_any): raise NotFittedError(msg % {"name": type(estimator).__name__}) From 855ee192407d19b51adb4f50a49c6752ee80c820 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 15 Jun 2023 20:32:20 -0400 Subject: [PATCH 15/39] Add quantile Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- sklearn/tree/_classes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index e715952947c04..b43bbeaf0b435 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -730,7 +730,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): The quantiles at which to evaluate, by default 0.5 (median). method : str, optional The method to interpolate, by default 'linear'. Can be any keyword - argument accepted by :func:`np.quantile`. + argument accepted by :func:`~np.quantile`. check_input : bool, optional Whether or not to check input, by default True. diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 64a444db0b228..d7d8cedb63696 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -641,7 +641,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True The quantiles at which to evaluate, by default 0.5 (median). method : str, optional The method to interpolate, by default 'linear'. Can be any keyword - argument accepted by :func:`np.quantile`. + argument accepted by :func:`~np.quantile`. check_input : bool, optional Whether or not to check input, by default True. From 3f5cb6597e36a08f651f8f0eb7324e9658a14bea Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Jun 2023 11:05:43 -0400 Subject: [PATCH 16/39] Add check input Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 -- sklearn/tree/_classes.py | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b43bbeaf0b435..c51c489dbd5dd 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -731,8 +731,6 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): method : str, optional The method to interpolate, by default 'linear'. Can be any keyword argument accepted by :func:`~np.quantile`. - check_input : bool, optional - Whether or not to check input, by default True. 
Returns ------- diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index d7d8cedb63696..78454b8854d26 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -602,6 +602,8 @@ def get_leaf_node_samples(self, X, check_input=True): ---------- X : array-like of shape (n_samples, n_features) Dataset to apply the forest to. + check_input : bool, default=True + Allow to bypass several input checking. Returns ------- From 7401ddcb19a42132cf46e79a14b22a2bdfb8519c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Jun 2023 18:35:39 -0400 Subject: [PATCH 17/39] Try to fix docstring Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 78454b8854d26..c75c933c49b39 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -607,12 +607,11 @@ def get_leaf_node_samples(self, X, check_input=True): Returns ------- - leaf_nodes_samples : a list of array-like of shape - (n_leaf_node_samples, n_outputs) + leaf_nodes_samples : a list of array-like Each sample is represented by the indices of the training samples that reached the leaf node. The ``n_leaf_node_samples`` may vary between samples, since the number of samples that fall in a leaf node is - variable. + variable. Each array has shape (n_leaf_node_samples, n_outputs). """ if not self.store_leaf_values: raise RuntimeError( From 13e29135bd0b640f3bf325ec40a22a879096b719 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Jun 2023 18:41:17 -0400 Subject: [PATCH 18/39] Try to fix docstring Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index c75c933c49b39..2d83a94dc8ec1 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1167,7 +1167,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): self : DecisionTreeClassifier Fitted estimator. """ - super()._fit( X, y, From 43aa3ef51ca96b58b00a178954d033579db09de9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 17 Jun 2023 10:41:44 -0400 Subject: [PATCH 19/39] Fix docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index c51c489dbd5dd..5482ebcaf1d41 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -817,12 +817,11 @@ def get_leaf_node_samples(self, X): Returns ------- - leaf_node_samples : a list of array-like of shape - (n_leaf_node_samples, n_outputs) + leaf_node_samples : a list of array-like Each sample is represented by the indices of the training samples that reached the leaf node. The ``n_leaf_node_samples`` may vary between samples, since the number of samples that fall in a leaf node is - variable. + variable. Each array-like has shape (n_leaf_node_samples, n_outputs). 
""" check_is_fitted(self) # Check data From fe3072f4ee28f49d590e7b437bf01bffd61ab917 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 17 Jun 2023 11:01:09 -0400 Subject: [PATCH 20/39] Fix docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 5482ebcaf1d41..9fd3af21b1fd9 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -696,7 +696,6 @@ def _bin_data(self, X, is_training_data): If is_training_data, then fit the _bin_mapper attribute. Else, the binned data is converted to a C-contiguous array. """ - description = "training" if is_training_data else "validation" if self.verbose: print( From 2d4de9aff7567bf796626aed4f27149f6ccf399c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 19 Jun 2023 21:33:55 -0400 Subject: [PATCH 21/39] Fix the predict quantiles docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 9fd3af21b1fd9..f85efb0b0a43b 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -733,9 +733,9 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): Returns ------- - y : ndarray of shape (n_samples, n_quantiles) or - (n_samples, n_quantiles, n_outputs) - The predicted values. + y : ndarray of shape (n_samples, n_quantiles, [n_output]) + The predicted values. The ``n_outputs`` dimension is present only + for multi-output regressors. """ if not self.store_leaf_values: raise RuntimeError( From 1c1ec8cff3a181b7a86a4df8a2aeb01fa7cdbe6a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 19 Jun 2023 21:35:33 -0400 Subject: [PATCH 22/39] Fix the predict quantiles docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f85efb0b0a43b..3eb61c9497918 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -733,7 +733,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): Returns ------- - y : ndarray of shape (n_samples, n_quantiles, [n_output]) + y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) The predicted values. The ``n_outputs`` dimension is present only for multi-output regressors. 
""" From 4bc651dd7916d7c267690ef0c9705b3f2d69c9d0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Jun 2023 12:02:45 -0400 Subject: [PATCH 23/39] Remove some diff Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 1 - sklearn/tree/_criterion.pyx | 18 ++++++++++++++++++ sklearn/tree/_tree.pxd | 3 ++- sklearn/tree/_tree.pyx | 2 -- sklearn/tree/tests/test_tree.py | 8 +++++--- 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 74e60c64ce85f..e61f674d300c9 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -511,7 +511,6 @@ def _build_tree( self.min_impurity_decrease, self.store_leaf_values, ) - builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 178a9adee9e80..2ddc02194c490 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -155,8 +155,10 @@ cdef class BaseCriterion: This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: + N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) + where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, @@ -165,8 +167,10 @@ cdef class BaseCriterion: ---------- impurity_parent : double The initial impurity of the parent node before the split + impurity_left : double The impurity of the left child + impurity_right : double The impurity of the right child @@ -611,10 +615,13 @@ cdef class Entropy(ClassificationCriterion): This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let + count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) + be the proportion of class k observations in node m. The cross-entropy is then defined as + cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) """ @@ -1058,10 +1065,14 @@ cdef class MSE(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. + The MSE proxy is derived from + sum_{i left}(y_i - y_pred_L)^2 + sum_{i right}(y_i - y_pred_R)^2 = sum(y_i^2) - n_L * mean_{i left}(y_i)^2 - n_R * mean_{i right}(y_i)^2 + Neglecting constant terms, this gives: + - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2 """ cdef SIZE_t k @@ -1139,6 +1150,7 @@ cdef class MAE(RegressionCriterion): ---------- n_outputs : SIZE_t The number of targets to be predicted + n_samples : SIZE_t The total number of samples to fit on """ @@ -1429,6 +1441,7 @@ cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman. Uses the formula (35) in Friedman's original Gradient Boosting paper: + diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) """ @@ -1483,6 +1496,7 @@ cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) + Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. 
It remains the implemented impurity (factor 2 is skipped): @@ -1519,12 +1533,16 @@ cdef class Poisson(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. + The Poisson proxy is derived from: + sum_{i left }(y_i * log(y_i / y_pred_L)) + sum_{i right}(y_i * log(y_i / y_pred_R)) = sum(y_i * log(y_i) - n_L * mean_{i left}(y_i) * log(mean_{i left}(y_i)) - n_R * mean_{i right}(y_i) * log(mean_{i right}(y_i)) + Neglecting constant terms, this gives + - sum{i left }(y_i) * log(mean{i left}(y_i)) - sum{i right}(y_i) * log(mean{i right}(y_i)) """ diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 7b933d905c79a..dedd820c41e0f 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -141,7 +141,8 @@ cdef class TreeBuilder: # This class controls the various stopping criteria and the node splitting # evaluation order, e.g. depth-first or best-first. - cdef Splitter splitter + cdef Splitter splitter # Splitting algorithm + cdef SIZE_t min_samples_split # Minimum number of samples in an internal node cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf cdef double min_weight_leaf # Minimum weight in a leaf diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 24b01b96aa726..c44022f54d3a5 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -61,7 +61,6 @@ cdef extern from "" namespace "std" nogil: from numpy import float32 as DTYPE from numpy import float64 as DOUBLE - cdef double INFINITY = np.inf cdef double EPSILON = np.finfo('double').eps @@ -87,7 +86,6 @@ NODE_DTYPE = np.asarray((&dummy)).dtype # TreeBuilder # ============================================================================= - cdef class TreeBuilder: """Interface for different tree building strategies.""" diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 792ba44b1302e..9be3dbd6f549e 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -33,13 +33,15 @@ DENSE_SPLITTERS, SPARSE_SPLITTERS, ) -from sklearn.tree._tree import NODE_DTYPE, TREE_LEAF, TREE_UNDEFINED -from sklearn.tree._tree import Tree as CythonTree from sklearn.tree._tree import ( + NODE_DTYPE, + TREE_LEAF, + TREE_UNDEFINED, _check_n_classes, _check_node_ndarray, _check_value_ndarray, ) +from sklearn.tree._tree import Tree as CythonTree from sklearn.utils import _IS_32BIT, compute_sample_weight from sklearn.utils._testing import ( assert_almost_equal, @@ -2424,7 +2426,7 @@ def test_missing_values_on_equal_nodes_no_missing(criterion): X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) - dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion) + dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True) dtc.fit(X, y) # Goes to right node because it has the most data points From cc035d04b9784e6facb7096a56c9c81801d819ec Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Jun 2023 15:42:08 -0400 Subject: [PATCH 24/39] Fix regression error Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 18 +++++++++--------- sklearn/tree/_criterion.pyx | 3 ++- sklearn/tree/_splitter.pyx | 6 ++++++ sklearn/tree/tests/test_tree.py | 4 +++- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index d8a94940799c0..f2e0201d534cd 100644 --- a/sklearn/ensemble/_forest.py +++ 
b/sklearn/ensemble/_forest.py @@ -40,27 +40,28 @@ class calls the ``fit`` method of each sub-estimator on random samples # License: BSD 3 clause -from time import time import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real +from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np from scipy.sparse import hstack as sparse_hstack from scipy.sparse import issparse -from sklearn.base import is_classifier, _fit_context from sklearn.base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, TransformerMixin, + _fit_context, + is_classifier, ) - -from sklearn.metrics import accuracy_score, r2_score -from sklearn.preprocessing import OneHotEncoder +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import accuracy_score, r2_score from sklearn.preprocessing import OneHotEncoder from sklearn.tree import ( BaseDecisionTree, @@ -69,8 +70,8 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeClassifier, ExtraTreeRegressor, ) -from ..tree._tree import DOUBLE, DTYPE from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions from sklearn.utils.multiclass import check_classification_targets, type_of_target from sklearn.utils.parallel import Parallel, delayed @@ -80,9 +81,8 @@ class calls the ``fit`` method of each sub-estimator on random samples _num_samples, check_is_fitted, ) -from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.utils._openmp_helpers import _openmp_effective_n_threads -from sklearn.ensemble._base import BaseEnsemble, _partition_estimators + +from ..tree._tree import DOUBLE, DTYPE __all__ = [ "RandomForestClassifier", diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 2ddc02194c490..bd1bdef0a6a93 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1496,10 +1496,11 @@ cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) - + Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. 
It remains the implemented impurity (factor 2 is skipped): + 1/n * sum(y_true * log(y_true/y_pred) """ # FIXME in 1.0: diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 007d55a589df7..bca38d5f04374 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -507,6 +507,12 @@ cdef inline int node_split_best( current_split.pos = p # Reject if min_samples_leaf is not guaranteed + if missing_go_to_left: + n_left = current_split.pos - splitter.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - splitter.start + n_right = end_non_missing - current_split.pos + n_missing if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 9be3dbd6f549e..0ce7a548c7bdb 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2426,7 +2426,9 @@ def test_missing_values_on_equal_nodes_no_missing(criterion): X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) - dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True) + dtc = DecisionTreeRegressor( + random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True + ) dtc.fit(X, y) # Goes to right node because it has the most data points From 4840d4e3e3ef6175c4e1197c87c77f8fe06f10cf Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Jun 2023 18:26:04 -0400 Subject: [PATCH 25/39] Fix boolean Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- sklearn/tree/_classes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f2e0201d534cd..b3feec10a3072 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -221,7 +221,7 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): None, Interval(Integral, 1, None, closed="left"), ], - "store_leaf_values": [bool], + "store_leaf_values": ["boolean"], } @abstractmethod diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index e61f674d300c9..6825c36df155c 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -123,7 +123,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], - "store_leaf_values": [bool], + "store_leaf_values": ["boolean"], } @abstractmethod From fdf2e2dbe1e1c316a1e2987aea31da26ebbec2cd Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 30 Jun 2023 12:49:16 -0700 Subject: [PATCH 26/39] Added doc to store_leaf_values Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b3feec10a3072..34bebab399566 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -822,6 +822,11 @@ def get_leaf_node_samples(self, X): samples, since the number of samples that fall in a leaf node is variable. Each array-like has shape (n_leaf_node_samples, n_outputs). 
""" + if not self.store_leaf_values: + raise RuntimeError( + "Leaf node samples are not available when store_leaf_values=False" + ) + check_is_fitted(self) # Check data X = self._validate_X_predict(X) @@ -1520,6 +1525,9 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` @@ -1879,6 +1887,9 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeRegressor` @@ -2232,6 +2243,9 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreesClassifier` @@ -2576,6 +2590,9 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` From 5b7ce7e1c6842aac174ebc4b1b2a68a1f1e25a7d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 30 Jun 2023 12:51:20 -0700 Subject: [PATCH 27/39] Merging main Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 6825c36df155c..200f87b0b9ef3 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1386,6 +1386,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- feature_importances_ : ndarray of shape (n_features,) @@ -1713,6 +1723,16 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -1959,6 +1979,16 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. 
+ + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- max_features_ : int From 9655d013870e3007d5c5a1898212a9d0eeea0968 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 30 Jun 2023 13:03:26 -0700 Subject: [PATCH 28/39] Fix now Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 34bebab399566..768eeeaf1959f 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -63,13 +63,6 @@ class calls the ``fit`` method of each sub-estimator on random samples from sklearn.exceptions import DataConversionWarning from sklearn.metrics import accuracy_score, r2_score from sklearn.preprocessing import OneHotEncoder -from sklearn.tree import ( - BaseDecisionTree, - DecisionTreeClassifier, - DecisionTreeRegressor, - ExtraTreeClassifier, - ExtraTreeRegressor, -) from sklearn.utils import check_random_state, compute_sample_weight from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions @@ -82,6 +75,13 @@ class calls the ``fit`` method of each sub-estimator on random samples check_is_fitted, ) +from ..tree import ( + BaseDecisionTree, + DecisionTreeClassifier, + DecisionTreeRegressor, + ExtraTreeClassifier, + ExtraTreeRegressor, +) from ..tree._tree import DOUBLE, DTYPE __all__ = [ From 6b57c5819782afec9ff5ac97e2c662bc8a66506d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 5 Jul 2023 10:52:29 -0400 Subject: [PATCH 29/39] Bring in monotonicity (#47) #### Reference Issues/PRs #### What does this implement/fix? Explain your changes. #### Any other comments? Signed-off-by: Adam Li --- doc/whats_new/v1.4.rst | 32 ++ sklearn/ensemble/_forest.py | 97 +++- sklearn/ensemble/_gb.py | 1 + sklearn/feature_selection/_mutual_info.py | 5 +- .../tests/test_mutual_info.py | 15 + sklearn/pipeline.py | 20 +- sklearn/preprocessing/_data.py | 5 +- sklearn/tree/_classes.py | 128 ++++- sklearn/tree/_criterion.pxd | 22 + sklearn/tree/_criterion.pyx | 125 +++++ sklearn/tree/_splitter.pxd | 22 +- sklearn/tree/_splitter.pyx | 133 ++++- sklearn/tree/_tree.pyx | 192 ++++++- sklearn/tree/tests/test_monotonic_tree.py | 491 ++++++++++++++++++ sklearn/tree/tests/test_tree.py | 2 +- 15 files changed, 1231 insertions(+), 59 deletions(-) create mode 100644 sklearn/tree/tests/test_monotonic_tree.py diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 54aacb3988e81..6a5660ee27b2e 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -59,6 +59,31 @@ TODO: update at the time of the release. passed to the ``fit`` method of the the estimator. :pr:`26506` by `Adrin Jalali`_. + +:mod:`sklearn.ensemble` +....................... + +- |Feature| :class:`ensemble.RandomForestClassifier`, + :class:`ensemble.RandomForestRegressor`, :class:`ensemble.ExtraTreesClassifier` + and :class:`ensemble.ExtraTreesRegressor` now support monotonic constraints, + useful when features are supposed to have a positive/negative effect on the target. + Missing values in the train data and multi-output targets are not supported. + :pr:`13649` by :user:`Samuel Ronsin `, + initiated by :user:`Patrick O'Reilly `. + + +:mod:`sklearn.tree` +................... 
+ +- |Feature| :class:`tree.DecisionTreeClassifier`, :class:`tree.DecisionTreeRegressor`, + :class:`tree.ExtraTreeClassifier` and :class:`tree.ExtraTreeRegressor` now support + monotonic constraints, useful when features are supposed to have a positive/negative + effect on the target. Missing values in the train data and multi-output targets are + not supported. + :pr:`13649` by :user:`Samuel Ronsin `, initiated by + :user:`Patrick O'Reilly `. + + :mod:`sklearn.decomposition` ............................ @@ -68,3 +93,10 @@ TODO: update at the time of the release. when using a custom initialization. The default value of this parameter will change from `None` to `auto` in version 1.6. :pr:`26634` by :user:`Alexandre Landeau ` and :user:`Alexandre Vigny `. + + +:mod:`sklearn.feature_selection` +................................ + +- |Fix| :func:`feature_selection.mutual_info_regression` now correctly computes the + result when `X` is of integer dtype. :pr:`26748` by :user:`Yao Xiao `. diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 768eeeaf1959f..47aa995ee51f3 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1525,9 +1525,31 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multiclass classifications (i.e. when `n_classes > 2`), + - multioutput classifications (i.e. when `n_outputs_ > 1`), + - classifications trained on data with missing values. + + The constraints hold over the probability of the positive class. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` @@ -1670,6 +1692,7 @@ def __init__( max_samples=None, max_bins=None, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( estimator=DecisionTreeClassifier(), @@ -1686,6 +1709,7 @@ def __init__( "random_state", "ccp_alpha", "store_leaf_values", + "monotonic_cst", ), bootstrap=bootstrap, oob_score=oob_score, @@ -1707,6 +1731,7 @@ def __init__( self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease + self.monotonic_cst = monotonic_cst self.ccp_alpha = ccp_alpha @@ -1887,9 +1912,29 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. Used for + speeding up training time. + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonically increasing + - 0: no constraint + - -1: monotonically decreasing + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multioutput regressions (i.e. when `n_outputs_ > 1`), + - regressions trained on data with missing values. 
+ + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeRegressor` @@ -2019,6 +2064,7 @@ def __init__( max_samples=None, max_bins=None, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( estimator=DecisionTreeRegressor(), @@ -2035,6 +2081,7 @@ def __init__( "random_state", "ccp_alpha", "store_leaf_values", + "monotonic_cst", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2056,6 +2103,7 @@ def __init__( self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease self.ccp_alpha = ccp_alpha + self.monotonic_cst = monotonic_cst class ExtraTreesClassifier(ForestClassifier): @@ -2242,10 +2290,32 @@ class ExtraTreesClassifier(ForestClassifier): `max_samples` should be in the interval `(0.0, 1.0]`. .. versionadded:: 0.22 + + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonically increasing + - 0: no constraint + - -1: monotonically decreasing + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multiclass classifications (i.e. when `n_classes > 2`), + - multioutput classifications (i.e. when `n_outputs_ > 1`), + - classifications trained on data with missing values. + + The constraints hold over the probability of the positive class. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreesClassifier` @@ -2377,6 +2447,7 @@ def __init__( max_samples=None, max_bins=None, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( estimator=ExtraTreeClassifier(), @@ -2393,6 +2464,7 @@ def __init__( "random_state", "ccp_alpha", "store_leaf_values", + "monotonic_cst", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2415,6 +2487,7 @@ def __init__( self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease self.ccp_alpha = ccp_alpha + self.monotonic_cst = monotonic_cst class ExtraTreesRegressor(ForestRegressor): @@ -2590,9 +2663,28 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonically increasing + - 0: no constraint + - -1: monotonically decreasing + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multioutput regressions (i.e. when `n_outputs_ > 1`), + - regressions trained on data with missing values. + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 1.4 + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` @@ -2707,6 +2799,7 @@ def __init__( max_samples=None, max_bins=None, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2723,6 +2816,7 @@ def __init__( "random_state", "ccp_alpha", "store_leaf_values", + "monotonic_cst", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2744,6 +2838,7 @@ def __init__( self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease self.ccp_alpha = ccp_alpha + self.monotonic_cst = monotonic_cst class RandomTreesEmbedding(TransformerMixin, BaseForest): @@ -2937,7 +3032,7 @@ class RandomTreesEmbedding(TransformerMixin, BaseForest): **BaseDecisionTree._parameter_constraints, "sparse_output": ["boolean"], } - for param in ("max_features", "ccp_alpha", "splitter"): + for param in ("max_features", "ccp_alpha", "splitter", "monotonic_cst"): _parameter_constraints.pop(param) criterion = "squared_error" diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 21acb6bfe7693..3a14da52047ad 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -138,6 +138,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): } _parameter_constraints.pop("store_leaf_values") _parameter_constraints.pop("splitter") + _parameter_constraints.pop("monotonic_cst") @abstractmethod def __init__( diff --git a/sklearn/feature_selection/_mutual_info.py b/sklearn/feature_selection/_mutual_info.py index 78425ad6299d5..bd62495ac28a3 100644 --- a/sklearn/feature_selection/_mutual_info.py +++ b/sklearn/feature_selection/_mutual_info.py @@ -280,15 +280,12 @@ def _estimate_mi( rng = check_random_state(random_state) if np.any(continuous_mask): - if copy: - X = X.copy() - + X = X.astype(np.float64, copy=copy) X[:, continuous_mask] = scale( X[:, continuous_mask], with_mean=False, copy=False ) # Add small noise to continuous features as advised in Kraskov et. al. - X = X.astype(np.float64, copy=False) means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) X[:, continuous_mask] += ( 1e-10 diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py index f7b4af0a393f9..349147f66e36c 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -236,3 +236,18 @@ def test_mutual_information_symmetry_classif_regression(correlated, global_rando ) assert mi_classif == pytest.approx(mi_regression) + + +def test_mutual_info_regression_X_int_dtype(global_random_seed): + """Check that results agree when X is integer dtype and float dtype. + + Non-regression test for Issue #26696. 
+ """ + rng = np.random.RandomState(global_random_seed) + X = rng.randint(100, size=(100, 10)) + X_float = X.astype(np.float64, copy=True) + y = rng.randint(100, size=100) + + expected = mutual_info_regression(X_float, y, random_state=global_random_seed) + result = mutual_info_regression(X, y, random_state=global_random_seed) + assert_allclose(result, expected) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 6fffd93c2a64c..26008c82fef11 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -131,10 +131,11 @@ class Pipeline(_BaseComposition): >>> pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())]) >>> # The pipeline can be used as any other estimator >>> # and avoids leaking the test set into the train set - >>> pipe.fit(X_train, y_train) - Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())]) - >>> pipe.score(X_test, y_test) + >>> pipe.fit(X_train, y_train).score(X_test, y_test) 0.88 + >>> # An estimator's parameter can be set using '__' syntax + >>> pipe.set_params(svc__C=10).fit(X_train, y_train).score(X_test, y_test) + 0.76 """ # BaseEstimator interface @@ -1051,6 +1052,10 @@ class FeatureUnion(TransformerMixin, _BaseComposition): >>> union.fit_transform(X) array([[ 1.5 , 3.0..., 0.8...], [-1.5 , 5.7..., -0.4...]]) + >>> # An estimator's parameter can be set using '__' syntax + >>> union.set_params(pca__n_components=1).fit_transform(X) + array([[ 1.5 , 3.0...], + [-1.5 , 5.7...]]) """ _required_parameters = ["transformer_list"] @@ -1362,11 +1367,12 @@ def __getitem__(self, name): def make_union(*transformers, n_jobs=None, verbose=False): - """Construct a FeatureUnion from the given transformers. + """Construct a :class:`FeatureUnion` from the given transformers. - This is a shorthand for the FeatureUnion constructor; it does not require, - and does not permit, naming the transformers. Instead, they will be given - names automatically based on their types. It also does not allow weighting. + This is a shorthand for the :class:`FeatureUnion` constructor; it does not + require, and does not permit, naming the transformers. Instead, they will + be given names automatically based on their types. It also does not allow + weighting. Parameters ---------- diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 56ae17f312fa2..197d709689daa 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -721,11 +721,12 @@ class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): mean_ : ndarray of shape (n_features,) or None The mean value for each feature in the training set. - Equal to ``None`` when ``with_mean=False``. + Equal to ``None`` when ``with_mean=False`` and ``with_std=False``. var_ : ndarray of shape (n_features,) or None The variance for each feature in the training set. Used to compute - `scale_`. Equal to ``None`` when ``with_std=False``. + `scale_`. Equal to ``None`` when ``with_mean=False`` and + ``with_std=False``. n_features_in_ : int Number of features seen during :term:`fit`. 
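The `monotonic_cst` option documented in the forest docstrings above can be exercised as in the following minimal sketch. This is illustrative only: the estimator and the `monotonic_cst` parameter come from this patch (mirroring the upstream scikit-learn 1.4 feature it ports), while the synthetic data and variable names are made up for the example.

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(300, 2))
    # Target increases with feature 0; feature 1 only adds wiggle.
    y = 5 * X[:, 0] + np.sin(6 * X[:, 1]) + rng.normal(scale=0.1, size=300)

    # Feature 0: monotonic increase (+1); feature 1: unconstrained (0).
    forest = RandomForestRegressor(
        n_estimators=50, monotonic_cst=[1, 0], random_state=0
    ).fit(X, y)

    # With the constraint, predictions cannot decrease along feature 0
    # when feature 1 is held fixed.
    grid = np.column_stack([np.linspace(0, 1, 100), np.full(100, 0.5)])
    assert np.all(np.diff(forest.predict(grid)) >= 0)

A value outside {-1, 0, 1}, a multi-output `y`, or (for classifiers) more than two classes is rejected with a `ValueError` by the `_build_tree` validation added in the tree diff below.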
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 200f87b0b9ef3..1b718f3a04052 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -124,6 +124,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], "store_leaf_values": ["boolean"], + "monotonic_cst": ["array-like", None], } @abstractmethod @@ -143,6 +144,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, store_leaf_values=False, + monotonic_cst=None, ): self.criterion = criterion self.splitter = splitter @@ -157,6 +159,7 @@ def __init__( self.class_weight = class_weight self.ccp_alpha = ccp_alpha self.store_leaf_values = store_leaf_values + self.monotonic_cst = monotonic_cst def get_depth(self): """Return the depth of the decision tree. @@ -184,7 +187,11 @@ def get_n_leaves(self): return self.tree_.n_leaves def _support_missing_values(self, X): - return not issparse(X) and self._get_tags()["allow_nan"] + return ( + not issparse(X) + and self._get_tags()["allow_nan"] + and self.monotonic_cst is None + ) def _compute_missing_values_in_feature_mask(self, X): """Return boolean mask denoting if there are missing values for each feature. @@ -469,7 +476,45 @@ def _build_tree( SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS - splitter = self.splitter + if self.monotonic_cst is None: + monotonic_cst = None + else: + if self.n_outputs_ > 1: + raise ValueError( + "Monotonicity constraints are not supported with multiple outputs." + ) + # Check to correct monotonicity constraint' specification, + # by applying element-wise logical conjunction + # Note: we do not cast `np.asarray(self.monotonic_cst, dtype=np.int8)` + # straight away here so as to generate error messages for invalid + # values using the original values prior to any dtype related conversion. + monotonic_cst = np.asarray(self.monotonic_cst) + if monotonic_cst.shape[0] != X.shape[1]: + raise ValueError( + "monotonic_cst has shape {} but the input data " + "X has {} features.".format(monotonic_cst.shape[0], X.shape[1]) + ) + valid_constraints = np.isin(monotonic_cst, (-1, 0, 1)) + if not np.all(valid_constraints): + unique_constaints_value = np.unique(monotonic_cst) + raise ValueError( + "monotonic_cst must be None or an array-like of -1, 0 or 1, but" + f" got {unique_constaints_value}" + ) + monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8) + if is_classifier(self): + if self.n_classes_[0] > 2: + raise ValueError( + "Monotonicity constraints are not supported with multiclass " + "classification" + ) + # Binary classification trees are built by constraining probabilities + # of the *negative class* in order to make the implementation similar + # to regression trees. + # Since self.monotonic_cst encodes constraints on probabilities of the + # *positive class*, all signs must be flipped. + monotonic_cst *= -1 + if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( criterion, @@ -477,6 +522,7 @@ def _build_tree( min_samples_leaf, min_weight_leaf, random_state, + monotonic_cst, ) if is_classifier(self): @@ -1003,6 +1049,25 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): One could instead store the indices in ``y_train`` that fall into each leaf, which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. 
+ - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multiclass classifications (i.e. when `n_classes > 2`), + - multioutput classifications (i.e. when `n_outputs_ > 1`), + - classifications trained on data with missing values. + + The constraints hold over the probability of the positive class. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -1121,6 +1186,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( criterion=criterion, @@ -1134,6 +1200,7 @@ def __init__( class_weight=class_weight, random_state=random_state, min_impurity_decrease=min_impurity_decrease, + monotonic_cst=monotonic_cst, ccp_alpha=ccp_alpha, store_leaf_values=store_leaf_values, ) @@ -1396,6 +1463,22 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): One could instead store the indices in ``y_train`` that fall into each leaf, which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multioutput regressions (i.e. when `n_outputs_ > 1`), + - regressions trained on data with missing values. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- feature_importances_ : ndarray of shape (n_features,) @@ -1495,6 +1578,7 @@ def __init__( min_impurity_decrease=0.0, ccp_alpha=0.0, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( criterion=criterion, @@ -1509,6 +1593,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, store_leaf_values=store_leaf_values, + monotonic_cst=monotonic_cst, ) @_fit_context(prefer_skip_nested_validation=True) @@ -1733,6 +1818,25 @@ class ExtraTreeClassifier(DecisionTreeClassifier): One could instead store the indices in ``y_train`` that fall into each leaf, which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multiclass classifications (i.e. when `n_classes > 2`), + - multioutput classifications (i.e. when `n_outputs_ > 1`), + - classifications trained on data with missing values. + + The constraints hold over the probability of the positive class. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -1834,6 +1938,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( criterion=criterion, @@ -1849,6 +1954,7 @@ def __init__( random_state=random_state, ccp_alpha=ccp_alpha, store_leaf_values=store_leaf_values, + monotonic_cst=monotonic_cst, ) @@ -1989,6 +2095,22 @@ class ExtraTreeRegressor(DecisionTreeRegressor): One could instead store the indices in ``y_train`` that fall into each leaf, which would lower RAM/diskspace usage. 
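A usage sketch of the `monotonic_cst` parameter documented above, mirroring the behaviour exercised by the new tests later in this patch (data and estimator settings are illustrative)::

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.RandomState(0)
    X = rng.rand(200, 2)
    y = (X[:, 0] + 0.1 * rng.randn(200) > 0.5).astype(int)

    # +1 constrains the predicted probability of the positive class to be
    # non-decreasing in feature 0; feature 1 is left unconstrained.
    clf = DecisionTreeClassifier(monotonic_cst=[1, 0], random_state=0).fit(X, y)

    X_shifted = X.copy()
    X_shifted[:, 0] += 10.0
    assert np.all(clf.predict_proba(X_shifted)[:, 1] >= clf.predict_proba(X)[:, 1])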
+ monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multioutput regressions (i.e. when `n_outputs_ > 1`), + - regressions trained on data with missing values. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + Attributes ---------- max_features_ : int @@ -2073,6 +2195,7 @@ def __init__( max_leaf_nodes=None, ccp_alpha=0.0, store_leaf_values=False, + monotonic_cst=None, ): super().__init__( criterion=criterion, @@ -2087,4 +2210,5 @@ def __init__( random_state=random_state, ccp_alpha=ccp_alpha, store_leaf_values=store_leaf_values, + monotonic_cst=monotonic_cst, ) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index ecbf56e5f6016..f972cf2afc932 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -10,6 +10,7 @@ # License: BSD 3 clause # See _criterion.pyx for implementation details. +cimport numpy as cnp from libcpp.vector cimport vector @@ -58,6 +59,13 @@ cdef class BaseCriterion: self, double* dest ) noexcept nogil + cdef void clip_node_value( + self, + double* dest, + double lower_bound, + double upper_bound + ) noexcept nogil + cdef double middle_value(self) noexcept nogil cdef double impurity_improvement( self, double impurity_parent, @@ -65,6 +73,20 @@ cdef class BaseCriterion: double impurity_right ) noexcept nogil cdef double proxy_impurity_improvement(self) noexcept nogil + cdef bint check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + ) noexcept nogil + cdef inline bint _check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + double sum_left, + double sum_right, + ) noexcept nogil cdef void set_sample_pointers( self, diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index bd1bdef0a6a93..41ead9fdb70e2 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -130,6 +130,34 @@ cdef class BaseCriterion: """ pass + cdef void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: + pass + + cdef double middle_value(self) noexcept nogil: + """Compute the middle value of a split for monotonicity constraints + + This method is implemented in ClassificationCriterion and RegressionCriterion. + """ + pass + + cdef bint check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + ) noexcept nogil: + pass + + cdef inline bint _check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + double value_left, + double value_right, + ) noexcept nogil: + pass + cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. @@ -608,6 +636,47 @@ cdef class ClassificationCriterion(Criterion): memcpy(dest, &self.sum_total[k, 0], self.n_classes[k] * sizeof(double)) dest += self.max_n_classes + cdef void clip_node_value(self, double * dest, double lower_bound, double upper_bound) noexcept nogil: + """Clip the value in dest between lower_bound and upper_bound for monotonic constraints. + + Note that monotonicity constraints are only supported for: + - single-output trees and + - binary classifications. 
+ """ + if dest[0] < lower_bound: + dest[0] = lower_bound + elif dest[0] > upper_bound: + dest[0] = upper_bound + + # Class proportions for binary classification must sum to 1. + dest[1] = 1 - dest[0] + + cdef inline double middle_value(self) noexcept nogil: + """Compute the middle value of a split for monotonicity constraints as the simple average + of the left and right children values. + + Note that monotonicity constraints are only supported for: + - single-output trees and + - binary classifications. + """ + return ( + (self.sum_left[0, 0] / (2 * self.weighted_n_left)) + + (self.sum_right[0, 0] / (2 * self.weighted_n_right)) + ) + + cdef inline bint check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + ) noexcept nogil: + """Check monotonicity constraint is satisfied at the current classification split""" + cdef: + double value_left = self.sum_left[0][0] / self.weighted_n_left + double value_right = self.sum_right[0][0] / self.weighted_n_right + + return self._check_monotonicity(monotonic_cst, lower_bound, upper_bound, value_left, value_right) + cdef class Entropy(ClassificationCriterion): r"""Cross Entropy impurity criterion. @@ -1033,6 +1102,37 @@ cdef class RegressionCriterion(Criterion): for k in range(self.n_outputs): dest[k] = self.sum_total[k] / self.weighted_n_node_samples + cdef inline void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: + """Clip the value in dest between lower_bound and upper_bound for monotonic constraints.""" + if dest[0] < lower_bound: + dest[0] = lower_bound + elif dest[0] > upper_bound: + dest[0] = upper_bound + + cdef double middle_value(self) noexcept nogil: + """Compute the middle value of a split for monotonicity constraints as the simple average + of the left and right children values. + + Monotonicity constraints are only supported for single-output trees we can safely assume + n_outputs == 1. + """ + return ( + (self.sum_left[0] / (2 * self.weighted_n_left)) + + (self.sum_right[0] / (2 * self.weighted_n_right)) + ) + + cdef bint check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + ) noexcept nogil: + """Check monotonicity constraint is satisfied at the current regression split""" + cdef: + double value_left = self.sum_left[0] / self.weighted_n_left + double value_right = self.sum_right[0] / self.weighted_n_right + + return self._check_monotonicity(monotonic_cst, lower_bound, upper_bound, value_left, value_right) cdef class MSE(RegressionCriterion): """Mean squared error impurity criterion. @@ -1365,6 +1465,31 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): dest[k] = self.node_medians[k] + cdef inline double middle_value(self) noexcept nogil: + """Compute the middle value of a split for monotonicity constraints as the simple average + of the left and right children values. + + Monotonicity constraints are only supported for single-output trees we can safely assume + n_outputs == 1. 
+ """ + return ( + ( self.left_child_ptr[0]).get_median() + + ( self.right_child_ptr[0]).get_median() + ) / 2 + + cdef inline bint check_monotonicity( + self, + cnp.int8_t monotonic_cst, + double lower_bound, + double upper_bound, + ) noexcept nogil: + """Check monotonicity constraint is satisfied at the current regression split""" + cdef: + double value_left = ( self.left_child_ptr[0]).get_median() + double value_right = ( self.right_child_ptr[0]).get_median() + + return self._check_monotonicity(monotonic_cst, lower_bound, upper_bound, value_left, value_right) + cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index a6515338c492d..3d8e4fd7510d7 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -10,6 +10,7 @@ # License: BSD 3 clause # See _splitter.pyx for details. +cimport numpy as cnp from libcpp.vector cimport vector @@ -31,6 +32,8 @@ cdef struct SplitRecord: double improvement # Impurity improvement given parent node. double impurity_left # Impurity of the left split. double impurity_right # Impurity of the right split. + double lower_bound # Lower bound on value of both children for monotonicity + double upper_bound # Upper bound on value of both children for monotonicity unsigned char missing_go_to_left # Controls if missing values go to the left node. SIZE_t n_missing # Number of missing values for the feature being split on @@ -61,6 +64,14 @@ cdef class BaseSplitter: cdef SIZE_t start # Start position for the current node cdef SIZE_t end # End position for the current node + # Monotonicity constraints for each feature. + # The encoding is as follows: + # -1: monotonic decrease + # 0: no constraint + # +1: monotonic increase + cdef const cnp.int8_t[:] monotonic_cst + cdef bint with_monotonic_cst + cdef const DOUBLE_t[:] sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -90,9 +101,17 @@ cdef class BaseSplitter: self, double impurity, # Impurity of the node SplitRecord* split, - SIZE_t* n_constant_features + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound, ) except -1 nogil cdef void node_value(self, double* dest) noexcept nogil + cdef void clip_node_value( + self, + double* dest, + double lower_bound, + double upper_bound + ) noexcept nogil cdef double node_impurity(self) noexcept nogil cdef int pointer_size(self) noexcept nogil @@ -117,6 +136,7 @@ cdef class Splitter(BaseSplitter): SIZE_t n_missing, bint missing_go_to_left, ) noexcept nogil + cdef bint check_postsplit_conditions( self ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index bca38d5f04374..a0fc6a7088e5c 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -21,6 +21,7 @@ from cython cimport final from libc.math cimport isnan from libc.stdlib cimport qsort from libc.string cimport memcpy +cimport numpy as cnp from ._criterion cimport Criterion @@ -88,8 +89,14 @@ cdef class BaseSplitter: """ pass - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound + ) except -1 nogil: """Find the best split on node samples[start:end]. This is a placeholder method. 
The majority of computation will be done @@ -103,6 +110,10 @@ cdef class BaseSplitter: """Copy the value of node samples[start:end] into dest.""" pass + cdef inline void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: + """Clip the value of node samples[start:end] into dest.""" + pass + cdef double node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" pass @@ -118,9 +129,16 @@ cdef class BaseSplitter: cdef class Splitter(BaseSplitter): """Abstract interface for supervised splitters.""" - def __cinit__(self, Criterion criterion, SIZE_t max_features, - SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state, *argv): + def __cinit__( + self, + Criterion criterion, + SIZE_t max_features, + SIZE_t min_samples_leaf, + double min_weight_leaf, + object random_state, + const cnp.int8_t[:] monotonic_cst, + *argv + ): """ Parameters ---------- @@ -142,6 +160,10 @@ cdef class Splitter(BaseSplitter): random_state : object The user inputted random state to be used for pseudo-randomness + + monotonic_cst : const cnp.int8_t[:] + Monotonicity constraints + """ self.criterion = criterion @@ -152,13 +174,16 @@ cdef class Splitter(BaseSplitter): self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.random_state = random_state + self.monotonic_cst = monotonic_cst + self.with_monotonic_cst = monotonic_cst is not None def __reduce__(self): return (type(self), (self.criterion, self.max_features, self.min_samples_leaf, self.min_weight_leaf, - self.random_state), self.__getstate__()) + self.random_state, + self.monotonic_cst), self.__getstate__()) cdef int init( self, @@ -275,6 +300,11 @@ cdef class Splitter(BaseSplitter): """Copy the value of node samples[start:end] into dest.""" self.criterion.node_value(dest) + + cdef inline void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: + """Clip the value in dest between lower_bound and upper_bound for monotonic constraints.""" + + self.criterion.clip_node_value(dest, lower_bound, upper_bound) cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil: """Copy the samples[start:end] into dest.""" @@ -367,6 +397,10 @@ cdef inline int node_split_best( double impurity, SplitRecord* split, SIZE_t* n_constant_features, + bint with_monotonic_cst, + const cnp.int8_t[:] monotonic_cst, + double lower_bound, + double upper_bound, ) except -1 nogil: """Find the best split on node samples[start:end] @@ -506,6 +540,18 @@ cdef inline int node_split_best( current_split.pos = p + # Reject if monotonicity constraints are not satisfied + if ( + with_monotonic_cst and + monotonic_cst[current_split.feature] != 0 and + not criterion.check_monotonicity( + monotonic_cst[current_split.feature], + lower_bound, + upper_bound, + ) + ): + continue + # Reject if min_samples_leaf is not guaranteed if missing_go_to_left: n_left = current_split.pos - splitter.start + n_missing @@ -729,7 +775,11 @@ cdef inline int node_split_random( Criterion criterion, double impurity, SplitRecord* split, - SIZE_t* n_constant_features + SIZE_t* n_constant_features, + bint with_monotonic_cst, + const cnp.int8_t[:] monotonic_cst, + double lower_bound, + double upper_bound, ) except -1 nogil: """Find the best random split on node samples[start:end] @@ -853,6 +903,18 @@ cdef inline int node_split_random( if splitter.check_postsplit_conditions() == 1: continue + # Reject if monotonicity constraints are not satisfied + if ( + with_monotonic_cst and 
+ monotonic_cst[current_split.feature] != 0 and + not criterion.check_monotonicity( + monotonic_cst[current_split.feature], + lower_bound, + upper_bound, + ) + ): + continue + current_proxy_improvement = criterion.proxy_impurity_improvement() if current_proxy_improvement > best_proxy_improvement: @@ -1538,8 +1600,14 @@ cdef class BestSplitter(Splitter): X, self.samples, self.feature_values, missing_values_in_feature_mask ) - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound + ) except -1 nogil: return node_split_best( self, self.partitioner, @@ -1547,6 +1615,10 @@ cdef class BestSplitter(Splitter): impurity, split, n_constant_features, + self.with_monotonic_cst, + self.monotonic_cst, + lower_bound, + upper_bound ) cdef class BestSparseSplitter(Splitter): @@ -1564,8 +1636,14 @@ cdef class BestSparseSplitter(Splitter): X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound + ) except -1 nogil: return node_split_best( self, self.partitioner, @@ -1573,6 +1651,10 @@ cdef class BestSparseSplitter(Splitter): impurity, split, n_constant_features, + self.with_monotonic_cst, + self.monotonic_cst, + lower_bound, + upper_bound ) cdef class RandomSplitter(Splitter): @@ -1590,8 +1672,14 @@ cdef class RandomSplitter(Splitter): X, self.samples, self.feature_values, missing_values_in_feature_mask ) - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound + ) except -1 nogil: return node_split_random( self, self.partitioner, @@ -1599,6 +1687,10 @@ cdef class RandomSplitter(Splitter): impurity, split, n_constant_features, + self.with_monotonic_cst, + self.monotonic_cst, + lower_bound, + upper_bound ) cdef class RandomSparseSplitter(Splitter): @@ -1615,9 +1707,14 @@ cdef class RandomSparseSplitter(Splitter): self.partitioner = SparsePartitioner( X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) - - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound + ) except -1 nogil: return node_split_random( self, self.partitioner, @@ -1625,4 +1722,8 @@ cdef class RandomSparseSplitter(Splitter): impurity, split, n_constant_features, + self.with_monotonic_cst, + self.monotonic_cst, + lower_bound, + upper_bound ) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index c44022f54d3a5..8dc4e94aa0732 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -149,6 +149,8 @@ cdef struct StackRecord: bint is_left double impurity SIZE_t n_constant_features + double lower_bound + double upper_bound cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" @@ -218,6 +220,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SplitRecord* 
split_ptr = malloc(splitter.pointer_size()) cdef double impurity = INFINITY + cdef double lower_bound + cdef double upper_bound + cdef double middle_value cdef SIZE_t n_constant_features cdef bint is_leaf cdef bint first = 1 @@ -236,7 +241,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": _TREE_UNDEFINED, "is_left": 0, "impurity": INFINITY, - "n_constant_features": 0}) + "n_constant_features": 0, + "lower_bound": -INFINITY, + "upper_bound": INFINITY, + }) while not builder_stack.empty(): stack_record = builder_stack.top() @@ -249,6 +257,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_left = stack_record.is_left impurity = stack_record.impurity n_constant_features = stack_record.n_constant_features + lower_bound = stack_record.lower_bound + upper_bound = stack_record.upper_bound n_node_samples = end - start splitter.node_reset(start, end, &weighted_n_node_samples) @@ -266,7 +276,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_leaf = is_leaf or impurity <= EPSILON if not is_leaf: - splitter.node_split(impurity, split_ptr, &n_constant_features) + splitter.node_split( + impurity, + split_ptr, + &n_constant_features, + lower_bound, + upper_bound + ) # assign local copy of SplitRecord to assign # pos, improvement, and impurity scores @@ -290,8 +306,42 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # Store value for all nodes, to facilitate tree/model # inspection and interpretation splitter.node_value(tree.value + node_id * tree.value_stride) + if splitter.with_monotonic_cst: + splitter.clip_node_value(tree.value + node_id * tree.value_stride, lower_bound, upper_bound) if not is_leaf: + if ( + not splitter.with_monotonic_cst or + splitter.monotonic_cst[split.feature] == 0 + ): + # Split on a feature with no monotonicity constraint + + # Current bounds must always be propagated to both children. + # If a monotonic constraint is active, bounds are used in + # node value clipping. + left_child_min = right_child_min = lower_bound + left_child_max = right_child_max = upper_bound + elif splitter.monotonic_cst[split.feature] == 1: + # Split on a feature with monotonic increase constraint + left_child_min = lower_bound + right_child_max = upper_bound + + # Lower bound for right child and upper bound for left child + # are set to the same value. + middle_value = splitter.criterion.middle_value() + right_child_min = middle_value + left_child_max = middle_value + else: # i.e. splitter.monotonic_cst[split.feature] == -1 + # Split on a feature with monotonic decrease constraint + right_child_min = lower_bound + left_child_max = upper_bound + + # Lower bound for left child and upper bound for right child + # are set to the same value. 
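The bound bookkeeping in the three branches above reduces to the following sketch (function and variable names are illustrative)::

    def child_bounds(lower_bound, upper_bound, middle_value, constraint):
        """Return (left_child_bounds, right_child_bounds) for one split."""
        if constraint == 0:
            # Unconstrained feature: propagate the parent's bounds unchanged.
            return (lower_bound, upper_bound), (lower_bound, upper_bound)
        if constraint == 1:
            # Monotonic increase: the left child may not exceed the middle
            # value, the right child may not fall below it.
            return (lower_bound, middle_value), (middle_value, upper_bound)
        # constraint == -1, monotonic decrease: mirror image of the above.
        return (middle_value, upper_bound), (lower_bound, middle_value)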
+ middle_value = splitter.criterion.middle_value() + left_child_min = middle_value + right_child_max = middle_value + # Push right child on stack builder_stack.push({ "start": split.pos, @@ -300,7 +350,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 0, "impurity": split.impurity_right, - "n_constant_features": n_constant_features}) + "n_constant_features": n_constant_features, + "lower_bound": right_child_min, + "upper_bound": right_child_max, + }) # Push left child on stack builder_stack.push({ @@ -310,7 +363,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 1, "impurity": split.impurity_left, - "n_constant_features": n_constant_features}) + "n_constant_features": n_constant_features, + "n_constant_features": n_constant_features, + "lower_bound": left_child_min, + "upper_bound": left_child_max, + }) elif self.store_leaf_values and is_leaf: # copy leaf values to leaf_values array splitter.node_samples(tree.value_samples[node_id]) @@ -346,6 +403,9 @@ cdef struct FrontierRecord: double impurity_left double impurity_right double improvement + double lower_bound + double upper_bound + double middle_value cdef inline bool _compare_records( const FrontierRecord& left, @@ -414,6 +474,10 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef FrontierRecord record cdef FrontierRecord split_node_left cdef FrontierRecord split_node_right + cdef double left_child_min + cdef double left_child_max + cdef double right_child_min + cdef double right_child_max cdef SIZE_t n_node_samples = splitter.n_samples cdef SIZE_t max_split_nodes = max_leaf_nodes - 1 @@ -428,9 +492,20 @@ cdef class BestFirstTreeBuilder(TreeBuilder): with nogil: # add root to frontier - rc = self._add_split_node(splitter, tree, 0, n_node_samples, - INFINITY, IS_FIRST, IS_LEFT, NULL, 0, - &split_node_left) + rc = self._add_split_node( + splitter=splitter, + tree=tree, + start=0, + end=n_node_samples, + impurity=INFINITY, + is_first=IS_FIRST, + is_left=IS_LEFT, + parent=NULL, + depth=0, + lower_bound=-INFINITY, + upper_bound=INFINITY, + res=&split_node_left, + ) if rc >= 0: _add_to_frontier(split_node_left, frontier) @@ -455,16 +530,54 @@ cdef class BestFirstTreeBuilder(TreeBuilder): else: # Node is expandable + if ( + not splitter.with_monotonic_cst or + splitter.monotonic_cst[node.feature] == 0 + ): + # Split on a feature with no monotonicity constraint + + # Current bounds must always be propagated to both children. + # If a monotonic constraint is active, bounds are used in + # node value clipping. + left_child_min = right_child_min = record.lower_bound + left_child_max = right_child_max = record.upper_bound + elif splitter.monotonic_cst[node.feature] == 1: + # Split on a feature with monotonic increase constraint + left_child_min = record.lower_bound + right_child_max = record.upper_bound + + # Lower bound for right child and upper bound for left child + # are set to the same value. + right_child_min = record.middle_value + left_child_max = record.middle_value + else: # i.e. splitter.monotonic_cst[split.feature] == -1 + # Split on a feature with monotonic decrease constraint + right_child_min = record.lower_bound + left_child_max = record.upper_bound + + # Lower bound for left child and upper bound for right child + # are set to the same value. 
+ left_child_min = record.middle_value + right_child_max = record.middle_value + # Decrement number of split nodes available max_split_nodes -= 1 # Compute left split node - rc = self._add_split_node(splitter, tree, - record.start, record.pos, - record.impurity_left, - IS_NOT_FIRST, IS_LEFT, node, - record.depth + 1, - &split_node_left) + rc = self._add_split_node( + splitter=splitter, + tree=tree, + start=record.start, + end=record.pos, + impurity=record.impurity_left, + is_first=IS_NOT_FIRST, + is_left=IS_LEFT, + parent=node, + depth=record.depth + 1, + lower_bound=left_child_min, + upper_bound=left_child_max, + res=&split_node_left, + ) if rc == -1: break @@ -472,12 +585,20 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node = &tree.nodes[record.node_id] # Compute right split node - rc = self._add_split_node(splitter, tree, record.pos, - record.end, - record.impurity_right, - IS_NOT_FIRST, IS_NOT_LEFT, node, - record.depth + 1, - &split_node_right) + rc = self._add_split_node( + splitter=splitter, + tree=tree, + start=record.pos, + end=record.end, + impurity=record.impurity_right, + is_first=IS_NOT_FIRST, + is_left=IS_NOT_LEFT, + parent=node, + depth=record.depth + 1, + lower_bound=right_child_min, + upper_bound=right_child_max, + res=&split_node_right, + ) if rc == -1: break @@ -497,11 +618,21 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if rc == -1: raise MemoryError() - cdef inline int _add_split_node(self, Splitter splitter, Tree tree, - SIZE_t start, SIZE_t end, double impurity, - bint is_first, bint is_left, Node* parent, - SIZE_t depth, - FrontierRecord* res) except -1 nogil: + cdef inline int _add_split_node( + self, + Splitter splitter, + Tree tree, + SIZE_t start, + SIZE_t end, + double impurity, + bint is_first, + bint is_left, + Node* parent, + SIZE_t depth, + double lower_bound, + double upper_bound, + FrontierRecord* res + ) nogil except -1: """Adds node w/ partition ``[start, end)`` to the frontier. """ cdef SplitRecord split cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) @@ -527,7 +658,13 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) if not is_leaf: - splitter.node_split(impurity, split_ptr, &n_constant_features) + splitter.node_split( + impurity, + split_ptr, + &n_constant_features, + lower_bound, + upper_bound + ) # assign local copy of SplitRecord to assign # pos, improvement, and impurity scores split = deref(split_ptr) @@ -548,12 +685,17 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # compute values also for split nodes (might become leafs later). 
splitter.node_value(tree.value + node_id * tree.value_stride) + if splitter.with_monotonic_cst: + splitter.clip_node_value(tree.value + node_id * tree.value_stride, lower_bound, upper_bound) res.node_id = node_id res.start = start res.end = end res.depth = depth res.impurity = impurity + res.lower_bound = lower_bound + res.upper_bound = upper_bound + res.middle_value = splitter.criterion.middle_value() if not is_leaf: # is split node diff --git a/sklearn/tree/tests/test_monotonic_tree.py b/sklearn/tree/tests/test_monotonic_tree.py new file mode 100644 index 0000000000000..462ac7305d7c2 --- /dev/null +++ b/sklearn/tree/tests/test_monotonic_tree.py @@ -0,0 +1,491 @@ +import numpy as np +import pytest +import scipy.sparse + +from sklearn.datasets import make_classification, make_regression +from sklearn.ensemble import ( + ExtraTreesClassifier, + ExtraTreesRegressor, + RandomForestClassifier, + RandomForestRegressor, +) +from sklearn.tree import ( + DecisionTreeClassifier, + DecisionTreeRegressor, + ExtraTreeClassifier, + ExtraTreeRegressor, +) + +TREE_CLASSIFIER_CLASSES = [DecisionTreeClassifier, ExtraTreeClassifier] +TREE_REGRESSOR_CLASSES = [DecisionTreeRegressor, ExtraTreeRegressor] +TREE_BASED_CLASSIFIER_CLASSES = TREE_CLASSIFIER_CLASSES + [ + RandomForestClassifier, + ExtraTreesClassifier, +] +TREE_BASED_REGRESSOR_CLASSES = TREE_REGRESSOR_CLASSES + [ + RandomForestRegressor, + ExtraTreesRegressor, +] + + +@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES) +@pytest.mark.parametrize("depth_first_builder", (True, False)) +@pytest.mark.parametrize("sparse_splitter", (True, False)) +def test_monotonic_constraints_classifications( + TreeClassifier, depth_first_builder, sparse_splitter, global_random_seed +): + n_samples = 1000 + n_samples_train = 900 + X, y = make_classification( + n_samples=n_samples, + n_classes=2, + n_features=5, + n_informative=5, + n_redundant=0, + random_state=global_random_seed, + ) + X_train, y_train = X[:n_samples_train], y[:n_samples_train] + X_test, _ = X[n_samples_train:], y[n_samples_train:] + + X_test_0incr, X_test_0decr = np.copy(X_test), np.copy(X_test) + X_test_1incr, X_test_1decr = np.copy(X_test), np.copy(X_test) + X_test_0incr[:, 0] += 10 + X_test_0decr[:, 0] -= 10 + X_test_1incr[:, 1] += 10 + X_test_1decr[:, 1] -= 10 + monotonic_cst = np.zeros(X.shape[1]) + monotonic_cst[0] = 1 + monotonic_cst[1] = -1 + + if depth_first_builder: + est = TreeClassifier(max_depth=None, monotonic_cst=monotonic_cst) + else: + est = TreeClassifier( + max_depth=None, + monotonic_cst=monotonic_cst, + max_leaf_nodes=n_samples_train, + ) + if hasattr(est, "random_state"): + est.set_params(**{"random_state": global_random_seed}) + if hasattr(est, "n_estimators"): + est.set_params(**{"n_estimators": 5}) + if sparse_splitter: + X_train = scipy.sparse.csc_matrix(X_train) + est.fit(X_train, y_train) + y = est.predict_proba(X_test)[:, 1] + + # Monotonic increase constraint, it applies to the positive class + assert np.all(est.predict_proba(X_test_0incr)[:, 1] >= y) + assert np.all(est.predict_proba(X_test_0decr)[:, 1] <= y) + + # Monotonic decrease constraint, it applies to the positive class + assert np.all(est.predict_proba(X_test_1incr)[:, 1] <= y) + assert np.all(est.predict_proba(X_test_1decr)[:, 1] >= y) + + +@pytest.mark.parametrize("TreeRegressor", TREE_BASED_REGRESSOR_CLASSES) +@pytest.mark.parametrize("depth_first_builder", (True, False)) +@pytest.mark.parametrize("sparse_splitter", (True, False)) +@pytest.mark.parametrize("criterion", ("absolute_error", 
"squared_error")) +def test_monotonic_constraints_regressions( + TreeRegressor, depth_first_builder, sparse_splitter, criterion, global_random_seed +): + n_samples = 1000 + n_samples_train = 900 + # Build a regression task using 5 informative features + X, y = make_regression( + n_samples=n_samples, + n_features=5, + n_informative=5, + random_state=global_random_seed, + ) + train = np.arange(n_samples_train) + test = np.arange(n_samples_train, n_samples) + X_train = X[train] + y_train = y[train] + X_test = np.copy(X[test]) + X_test_incr = np.copy(X_test) + X_test_decr = np.copy(X_test) + X_test_incr[:, 0] += 10 + X_test_decr[:, 1] += 10 + monotonic_cst = np.zeros(X.shape[1]) + monotonic_cst[0] = 1 + monotonic_cst[1] = -1 + + if depth_first_builder: + est = TreeRegressor( + max_depth=None, + monotonic_cst=monotonic_cst, + criterion=criterion, + ) + else: + est = TreeRegressor( + max_depth=8, + monotonic_cst=monotonic_cst, + criterion=criterion, + max_leaf_nodes=n_samples_train, + ) + if hasattr(est, "random_state"): + est.set_params(random_state=global_random_seed) + if hasattr(est, "n_estimators"): + est.set_params(**{"n_estimators": 5}) + if sparse_splitter: + X_train = scipy.sparse.csc_matrix(X_train) + est.fit(X_train, y_train) + y = est.predict(X_test) + # Monotonic increase constraint + y_incr = est.predict(X_test_incr) + # y_incr should always be greater than y + assert np.all(y_incr >= y) + + # Monotonic decrease constraint + y_decr = est.predict(X_test_decr) + # y_decr should always be lower than y + assert np.all(y_decr <= y) + + +@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES) +def test_multiclass_raises(TreeClassifier): + X, y = make_classification( + n_samples=100, n_features=5, n_classes=3, n_informative=3, random_state=0 + ) + y[0] = 0 + monotonic_cst = np.zeros(X.shape[1]) + monotonic_cst[0] = -1 + monotonic_cst[1] = 1 + est = TreeClassifier(max_depth=None, monotonic_cst=monotonic_cst, random_state=0) + + msg = "Monotonicity constraints are not supported with multiclass classification" + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + +@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES) +def test_multiple_output_raises(TreeClassifier): + X = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]] + y = [[1, 0, 1, 0, 1], [1, 0, 1, 0, 1]] + + est = TreeClassifier( + max_depth=None, monotonic_cst=np.array([-1, 1]), random_state=0 + ) + msg = "Monotonicity constraints are not supported with multiple output" + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + +@pytest.mark.parametrize( + "DecisionTreeEstimator", [DecisionTreeClassifier, DecisionTreeRegressor] +) +def test_missing_values_raises(DecisionTreeEstimator): + X, y = make_classification( + n_samples=100, n_features=5, n_classes=2, n_informative=3, random_state=0 + ) + X[0, 0] = np.nan + monotonic_cst = np.zeros(X.shape[1]) + monotonic_cst[0] = 1 + est = DecisionTreeEstimator( + max_depth=None, monotonic_cst=monotonic_cst, random_state=0 + ) + + msg = "Input X contains NaN" + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + +@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES) +def test_bad_monotonic_cst_raises(TreeClassifier): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + y = [1, 0, 1, 0, 1] + + msg = "monotonic_cst has shape 3 but the input data X has 2 features." 
+ est = TreeClassifier( + max_depth=None, monotonic_cst=np.array([-1, 1, 0]), random_state=0 + ) + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + msg = "monotonic_cst must be None or an array-like of -1, 0 or 1." + est = TreeClassifier( + max_depth=None, monotonic_cst=np.array([-2, 2]), random_state=0 + ) + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + est = TreeClassifier( + max_depth=None, monotonic_cst=np.array([-1, 0.8]), random_state=0 + ) + with pytest.raises(ValueError, match=msg + "(.*)0.8]"): + est.fit(X, y) + + +def assert_1d_reg_tree_children_monotonic_bounded(tree_, monotonic_sign): + values = tree_.value + for i in range(tree_.node_count): + if tree_.children_left[i] > i and tree_.children_right[i] > i: + # Check monotonicity on children + i_left = tree_.children_left[i] + i_right = tree_.children_right[i] + if monotonic_sign == 1: + assert values[i_left] <= values[i_right] + elif monotonic_sign == -1: + assert values[i_left] >= values[i_right] + val_middle = (values[i_left] + values[i_right]) / 2 + # Check bounds on grand-children, filtering out leaf nodes + if tree_.feature[i_left] >= 0: + i_left_right = tree_.children_right[i_left] + if monotonic_sign == 1: + assert values[i_left_right] <= val_middle + elif monotonic_sign == -1: + assert values[i_left_right] >= val_middle + if tree_.feature[i_right] >= 0: + i_right_left = tree_.children_left[i_right] + if monotonic_sign == 1: + assert val_middle <= values[i_right_left] + elif monotonic_sign == -1: + assert val_middle >= values[i_right_left] + + +def test_assert_1d_reg_tree_children_monotonic_bounded(): + X = np.linspace(-1, 1, 7).reshape(-1, 1) + y = np.sin(2 * np.pi * X.ravel()) + + reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y) + + with pytest.raises(AssertionError): + assert_1d_reg_tree_children_monotonic_bounded(reg.tree_, 1) + + with pytest.raises(AssertionError): + assert_1d_reg_tree_children_monotonic_bounded(reg.tree_, -1) + + +def assert_1d_reg_monotonic(clf, monotonic_sign, min_x, max_x, n_steps): + X_grid = np.linspace(min_x, max_x, n_steps).reshape(-1, 1) + y_pred_grid = clf.predict(X_grid) + if monotonic_sign == 1: + assert (np.diff(y_pred_grid) >= 0.0).all() + elif monotonic_sign == -1: + assert (np.diff(y_pred_grid) <= 0.0).all() + + +@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES) +def test_1d_opposite_monotonicity_cst_data(TreeRegressor): + # Check that positive monotonic data with negative monotonic constraint + # yield constant predictions, equal to the average of target values + X = np.linspace(-2, 2, 10).reshape(-1, 1) + y = X.ravel() + clf = TreeRegressor(monotonic_cst=[-1]) + clf.fit(X, y) + assert clf.tree_.node_count == 1 + assert clf.tree_.value[0] == 0.0 + + # Swap monotonicity + clf = TreeRegressor(monotonic_cst=[1]) + clf.fit(X, -y) + assert clf.tree_.node_count == 1 + assert clf.tree_.value[0] == 0.0 + + +@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES) +@pytest.mark.parametrize("monotonic_sign", (-1, 1)) +@pytest.mark.parametrize("depth_first_builder", (True, False)) +@pytest.mark.parametrize("criterion", ("absolute_error", "squared_error")) +def test_1d_tree_nodes_values( + TreeRegressor, monotonic_sign, depth_first_builder, criterion, global_random_seed +): + # Adaptation from test_nodes_values in test_monotonic_constraints.py + # in sklearn.ensemble._hist_gradient_boosting + # Build a single tree with only one feature, and make sure the node + # values respect the monotonicity constraints. 
+ + # Considering the following tree with a monotonic +1 constraint, we + # should have: + # + # root + # / \ + # a b + # / \ / \ + # c d e f + # + # a <= root <= b + # c <= d <= (a + b) / 2 <= e <= f + + rng = np.random.RandomState(global_random_seed) + n_samples = 1000 + n_features = 1 + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + + if depth_first_builder: + # No max_leaf_nodes, default depth first tree builder + clf = TreeRegressor( + monotonic_cst=[monotonic_sign], + criterion=criterion, + random_state=global_random_seed, + ) + else: + # max_leaf_nodes triggers best first tree builder + clf = TreeRegressor( + monotonic_cst=[monotonic_sign], + max_leaf_nodes=n_samples, + criterion=criterion, + random_state=global_random_seed, + ) + clf.fit(X, y) + + assert_1d_reg_tree_children_monotonic_bounded(clf.tree_, monotonic_sign) + assert_1d_reg_monotonic(clf, monotonic_sign, np.min(X), np.max(X), 100) + + +def assert_nd_reg_tree_children_monotonic_bounded(tree_, monotonic_cst): + upper_bound = np.full(tree_.node_count, np.inf) + lower_bound = np.full(tree_.node_count, -np.inf) + for i in range(tree_.node_count): + feature = tree_.feature[i] + node_value = tree_.value[i][0][0] # unpack value from nx1x1 array + # While building the tree, the computed middle value is slightly + # different from the average of the siblings values, because + # sum_right / weighted_n_right + # is slightly different from the value of the right sibling. + # This can cause a discrepancy up to numerical noise when clipping, + # which is resolved by comparing with some loss of precision. + assert np.float32(node_value) <= np.float32(upper_bound[i]) + assert np.float32(node_value) >= np.float32(lower_bound[i]) + + if feature < 0: + # Leaf: nothing to do + continue + + # Split node: check and update bounds for the children. + i_left = tree_.children_left[i] + i_right = tree_.children_right[i] + # unpack value from nx1x1 array + middle_value = (tree_.value[i_left][0][0] + tree_.value[i_right][0][0]) / 2 + + if monotonic_cst[feature] == 0: + # Feature without monotonicity constraint: propagate bounds + # down the tree to both children. + # Otherwise, with 2 features and a monotonic increase constraint + # (encoded by +1) on feature 0, the following tree can be accepted, + # although it does not respect the monotonic increase constraint: + # + # X[0] <= 0 + # value = 100 + # / \ + # X[0] <= -1 X[1] <= 0 + # value = 50 value = 150 + # / \ / \ + # leaf leaf leaf leaf + # value = 25 value = 75 value = 50 value = 250 + + lower_bound[i_left] = lower_bound[i] + upper_bound[i_left] = upper_bound[i] + lower_bound[i_right] = lower_bound[i] + upper_bound[i_right] = upper_bound[i] + + elif monotonic_cst[feature] == 1: + # Feature with constraint: check monotonicity + assert tree_.value[i_left] <= tree_.value[i_right] + + # Propagate bounds down the tree to both children. + lower_bound[i_left] = lower_bound[i] + upper_bound[i_left] = middle_value + lower_bound[i_right] = middle_value + upper_bound[i_right] = upper_bound[i] + + elif monotonic_cst[feature] == -1: + # Feature with constraint: check monotonicity + assert tree_.value[i_left] >= tree_.value[i_right] + + # Update and propagate bounds down the tree to both children. 
+ lower_bound[i_left] = middle_value + upper_bound[i_left] = upper_bound[i] + lower_bound[i_right] = lower_bound[i] + upper_bound[i_right] = middle_value + + else: # pragma: no cover + raise ValueError(f"monotonic_cst[{feature}]={monotonic_cst[feature]}") + + +def test_assert_nd_reg_tree_children_monotonic_bounded(): + # Check that assert_nd_reg_tree_children_monotonic_bounded can detect + # non-monotonic tree predictions. + X = np.linspace(0, 2 * np.pi, 30).reshape(-1, 1) + y = np.sin(X).ravel() + reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y) + + with pytest.raises(AssertionError): + assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [1]) + + with pytest.raises(AssertionError): + assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [-1]) + + assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [0]) + + # Check that assert_nd_reg_tree_children_monotonic_bounded raises + # when the data (and therefore the model) is naturally monotonic in the + # opposite direction. + X = np.linspace(-5, 5, 5).reshape(-1, 1) + y = X.ravel() ** 3 + reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y) + + with pytest.raises(AssertionError): + assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [-1]) + + # For completeness, check that the converse holds when swapping the sign. + reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, -y) + + with pytest.raises(AssertionError): + assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [1]) + + +@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES) +@pytest.mark.parametrize("monotonic_sign", (-1, 1)) +@pytest.mark.parametrize("depth_first_builder", (True, False)) +@pytest.mark.parametrize("criterion", ("absolute_error", "squared_error")) +def test_nd_tree_nodes_values( + TreeRegressor, monotonic_sign, depth_first_builder, criterion, global_random_seed +): + # Build tree with several features, and make sure the nodes + # values respect the monotonicity constraints. + + # Considering the following tree with a monotonic increase constraint on X[0], + # we should have: + # + # root + # X[0]<=t + # / \ + # a b + # X[0]<=u X[1]<=v + # / \ / \ + # c d e f + # + # i) a <= root <= b + # ii) c <= a <= d <= (a+b)/2 + # iii) (a+b)/2 <= min(e,f) + # For iii) we check that each node value is within the proper lower and + # upper bounds. 
+ + rng = np.random.RandomState(global_random_seed) + n_samples = 1000 + n_features = 2 + monotonic_cst = [monotonic_sign, 0] + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + + if depth_first_builder: + # No max_leaf_nodes, default depth first tree builder + clf = TreeRegressor( + monotonic_cst=monotonic_cst, + criterion=criterion, + random_state=global_random_seed, + ) + else: + # max_leaf_nodes triggers best first tree builder + clf = TreeRegressor( + monotonic_cst=monotonic_cst, + max_leaf_nodes=n_samples, + criterion=criterion, + random_state=global_random_seed, + ) + clf.fit(X, y) + assert_nd_reg_tree_children_monotonic_bounded(clf.tree_, monotonic_cst) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 0ce7a548c7bdb..ccca6d60ed48b 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2371,7 +2371,7 @@ def test_splitter_serializable(Splitter): n_outputs, n_classes = 2, np.array([3, 2], dtype=np.intp) criterion = CRITERIA_CLF["gini"](n_outputs, n_classes) - splitter = Splitter(criterion, max_features, 5, 0.5, rng) + splitter = Splitter(criterion, max_features, 5, 0.5, rng, monotonic_cst=None) splitter_serialize = pickle.dumps(splitter) splitter_back = pickle.loads(splitter_serialize) From 34e540a98d3103388003ff8aa3bb2066404a5f31 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 5 Jul 2023 11:00:37 -0400 Subject: [PATCH 30/39] Fix splitter Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index af59a594793b8..429f49947c47d 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -110,7 +110,7 @@ cdef class BaseSplitter: """Copy the value of node samples[start:end] into dest.""" pass - cdef inline void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: + cdef void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: """Clip the value of node samples[start:end] into dest.""" pass @@ -310,11 +310,6 @@ cdef class Splitter(BaseSplitter): """Copy the samples[start:end] into dest.""" self.criterion.node_samples(dest) - cdef inline void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: - """Clip the value in dest between lower_bound and upper_bound for monotonic constraints.""" - - self.criterion.clip_node_value(dest, lower_bound, upper_bound) - cdef double node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" From e9d702b8a38824f67752b9a5a6aefb964b511551 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 19 Jul 2023 16:19:49 -0400 Subject: [PATCH 31/39] Fix linter Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 1d3b247bd4586..7e494b0e9bccc 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2290,7 +2290,7 @@ class ExtraTreesClassifier(ForestClassifier): `max_samples` should be in the interval `(0.0, 1.0]`. .. versionadded:: 0.22 - + max_bins : int, default=255 The maximum number of bins to use for non-missing values. 
From ce6a727f9ddd814c90997eae2e7aae441566a18f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jul 2023 10:56:41 -0400 Subject: [PATCH 32/39] Fix linting Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 7e494b0e9bccc..7f2c5d44c2c1a 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -777,16 +777,21 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): # (n_total_leaf_samples, n_outputs) leaf_node_samples = np.vstack( - ( + [ est.leaf_nodes_samples_[leaf_nodes[jdx]] for jdx, est in enumerate(self.estimators_) - ) + ] ) # get quantiles across all leaf node samples - y_hat[idx, ...] = np.quantile( - leaf_node_samples, quantiles, axis=0, interpolation=method - ) + try: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, method=method + ) + except TypeError: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) if is_classifier(self): if self.n_outputs_ == 1: From 00a3595b973f01e25d9ee50eedd0504b89096c8e Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jul 2023 11:21:20 -0400 Subject: [PATCH 33/39] Fix docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 7f2c5d44c2c1a..4cad13b6c7658 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1533,9 +1533,13 @@ class RandomForestClassifier(ForestClassifier): max_bins : int, default=255 The maximum number of bins to use for non-missing values. + **Experimental feature** + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + **Experimental feature** + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1921,9 +1925,13 @@ class RandomForestRegressor(ForestRegressor): The maximum number of bins to use for non-missing values. Used for speeding up training time. + **Experimental feature** + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + **Experimental feature** + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2299,9 +2307,13 @@ class ExtraTreesClassifier(ForestClassifier): max_bins : int, default=255 The maximum number of bins to use for non-missing values. + **Experimental feature** + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + **Experimental feature** + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2690,9 +2702,13 @@ class ExtraTreesRegressor(ForestRegressor): max_bins : int, default=255 The maximum number of bins to use for non-missing values. + **Experimental feature** + store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. + **Experimental feature** + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. 
- 1: monotonically increasing From 329cbc89ff19991429f042b72909bfd412ed4c63 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jul 2023 13:25:00 -0400 Subject: [PATCH 34/39] Fix lint Signed-off-by: Adam Li --- min_dependency_substitutions.rst | 28 ++++++++++++++++++++++++++++ min_dependency_table.rst | 32 ++++++++++++++++++++++++++++++++ sklearn/ensemble/_forest.py | 2 +- 3 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 min_dependency_substitutions.rst create mode 100644 min_dependency_table.rst diff --git a/min_dependency_substitutions.rst b/min_dependency_substitutions.rst new file mode 100644 index 0000000000000..575b003b15a32 --- /dev/null +++ b/min_dependency_substitutions.rst @@ -0,0 +1,28 @@ +.. |NumpyMinVersion| replace:: 1.17.3 +.. |ScipyMinVersion| replace:: 1.5.0 +.. |JoblibMinVersion| replace:: 1.1.1 +.. |ThreadpoolctlMinVersion| replace:: 2.0.0 +.. |CythonMinVersion| replace:: 0.29.33 +.. |MatplotlibMinVersion| replace:: 3.1.3 +.. |Scikit-imageMinVersion| replace:: 0.16.2 +.. |PandasMinVersion| replace:: 1.0.5 +.. |SeabornMinVersion| replace:: 0.9.0 +.. |Memory_profilerMinVersion| replace:: 0.57.0 +.. |PytestMinVersion| replace:: 7.1.2 +.. |Pytest-covMinVersion| replace:: 2.9.0 +.. |RuffMinVersion| replace:: 0.0.272 +.. |BlackMinVersion| replace:: 23.3.0 +.. |MypyMinVersion| replace:: 1.3 +.. |PyamgMinVersion| replace:: 4.0.0 +.. |PolarsMinVersion| replace:: 0.18.2 +.. |PyarrowMinVersion| replace:: 12.0.0 +.. |SphinxMinVersion| replace:: 6.0.0 +.. |Sphinx-copybuttonMinVersion| replace:: 0.5.2 +.. |Sphinx-galleryMinVersion| replace:: 0.10.1 +.. |NumpydocMinVersion| replace:: 1.2.0 +.. |PillowMinVersion| replace:: 7.1.2 +.. |PoochMinVersion| replace:: 1.6.0 +.. |Sphinx-promptMinVersion| replace:: 1.3.0 +.. |Sphinxext-opengraphMinVersion| replace:: 0.4.2 +.. |PlotlyMinVersion| replace:: 5.14.0 +.. |Conda-lockMinVersion| replace:: 2.1.1 diff --git a/min_dependency_table.rst b/min_dependency_table.rst new file mode 100644 index 0000000000000..3a223a0fef797 --- /dev/null +++ b/min_dependency_table.rst @@ -0,0 +1,32 @@ +======================= =================== ==================================== +Dependency Minimum Version Purpose +======================= =================== ==================================== +numpy 1.17.3 build, install +scipy 1.5.0 build, install +joblib 1.1.1 install +threadpoolctl 2.0.0 install +cython 0.29.33 build +matplotlib 3.1.3 benchmark, docs, examples, tests +scikit-image 0.16.2 docs, examples, tests +pandas 1.0.5 benchmark, docs, examples, tests +seaborn 0.9.0 docs, examples +memory_profiler 0.57.0 benchmark, docs +pytest 7.1.2 tests +pytest-cov 2.9.0 tests +ruff 0.0.272 tests +black 23.3.0 tests +mypy 1.3 tests +pyamg 4.0.0 tests +polars 0.18.2 tests +pyarrow 12.0.0 tests +sphinx 6.0.0 docs +sphinx-copybutton 0.5.2 docs +sphinx-gallery 0.10.1 docs +numpydoc 1.2.0 docs, tests +Pillow 7.1.2 docs +pooch 1.6.0 docs, examples, tests +sphinx-prompt 1.3.0 docs +sphinxext-opengraph 0.4.2 docs +plotly 5.14.0 docs, examples +conda-lock 2.1.1 maintenance +======================= =================== ==================================== diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 4cad13b6c7658..66af09d79f203 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2708,7 +2708,7 @@ class ExtraTreesRegressor(ForestRegressor): Whether to store the leaf values in the ``get_leaf_node_samples`` function. 
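The `try`/`except TypeError` around `np.quantile` in the `predict_quantiles` changes of this series is a NumPy-version shim: the `interpolation` keyword was renamed to `method` in NumPy 1.22. A standalone sketch of the same idea (helper name is illustrative)::

    import numpy as np

    def quantile_compat(a, q, axis=0, method="nearest"):
        # Prefer the modern keyword, fall back to the deprecated one on old NumPy.
        try:
            return np.quantile(a, q, axis=axis, method=method)
        except TypeError:
            return np.quantile(a, q, axis=axis, interpolation=method)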
**Experimental feature** - + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing From 8b5d0f9e5d5b13016c5a3444f632469e8c799f7a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jul 2023 14:13:49 -0400 Subject: [PATCH 35/39] Fix unit test Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 1b718f3a04052..fa68f18921636 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -724,9 +724,14 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True for idx, leaf_id in enumerate(X_leaves): # predict by taking the quantile across the samples in the leaf for # each output - proba[idx, ...] = np.quantile( - leaf_samples[leaf_id], quantiles, axis=0, interpolation=method - ) + try: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, method=method + ) + except TypeError: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, interpolation=method + ) # Classification if is_classifier(self): From 38bade77502b16a203cf6448d9badabcef5d69fa Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jul 2023 16:37:52 -0400 Subject: [PATCH 36/39] Fix lint Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 38 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 66af09d79f203..4e33fc07d85cc 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1533,12 +1533,12 @@ class RandomForestClassifier(ForestClassifier): max_bins : int, default=255 The maximum number of bins to use for non-missing values. - **Experimental feature** + **This is an experimental feature**. store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. - **Experimental feature** + **This is an experimental feature**. monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. @@ -1925,12 +1925,12 @@ class RandomForestRegressor(ForestRegressor): The maximum number of bins to use for non-missing values. Used for speeding up training time. - **Experimental feature** + **This is an experimental feature**. store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. - **Experimental feature** + **This is an experimental feature**. monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. @@ -2307,31 +2307,12 @@ class ExtraTreesClassifier(ForestClassifier): max_bins : int, default=255 The maximum number of bins to use for non-missing values. - **Experimental feature** + **This is an experimental feature**. store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. - **Experimental feature** - - monotonic_cst : array-like of int of shape (n_features), default=None - Indicates the monotonicity constraint to enforce on each feature. - - 1: monotonically increasing - - 0: no constraint - - -1: monotonically decreasing - - If monotonic_cst is None, no constraints are applied. - - Monotonicity constraints are not supported for: - - multiclass classifications (i.e. 
when `n_classes > 2`), - - multioutput classifications (i.e. when `n_outputs_ > 1`), - - classifications trained on data with missing values. - - The constraints hold over the probability of the positive class. - - Read more in the :ref:`User Guide `. - - .. versionadded:: 1.4 + **This is an experimental feature**. monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. @@ -2702,12 +2683,12 @@ class ExtraTreesRegressor(ForestRegressor): max_bins : int, default=255 The maximum number of bins to use for non-missing values. - **Experimental feature** + **This is an experimental feature**. store_leaf_values : bool, default=False Whether to store the leaf values in the ``get_leaf_node_samples`` function. - **Experimental feature** + **This is an experimental feature**. monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. @@ -2990,6 +2971,9 @@ class RandomTreesEmbedding(TransformerMixin, BaseForest): new forest. See :term:`Glossary ` and :ref:`gradient_boosting_warm_start` for details. + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance From feffdeb35834a9258348dec525820b615dd03fd0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jul 2023 19:45:39 -0400 Subject: [PATCH 37/39] Adding fix Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index fa68f18921636..091c7e9b4c002 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -416,9 +416,6 @@ def _fit( random_state, ) - if self.store_leaf_values: - self.leaf_nodes_samples_ = self.tree_.leaf_nodes_samples - return self def _build_tree( @@ -1118,9 +1115,6 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. - leaf_nodes_samples_ : dict - A dictionary of leaf node index and the y_train samples in that leaf. - See Also -------- DecisionTreeRegressor : A decision tree regressor. From 2bb5f1c13e08e811bf6868e6cd7d44d7337e8cd4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 21 Jul 2023 11:46:50 -0400 Subject: [PATCH 38/39] Fixed Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- sklearn/tree/_classes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 4e33fc07d85cc..f4a574c62c5e9 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -778,7 +778,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): # (n_total_leaf_samples, n_outputs) leaf_node_samples = np.vstack( [ - est.leaf_nodes_samples_[leaf_nodes[jdx]] + est.tree_.leaf_nodes_samples[leaf_nodes[jdx]] for jdx, est in enumerate(self.estimators_) ] ) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 091c7e9b4c002..7482fd6022e50 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -651,7 +651,7 @@ def get_leaf_node_samples(self, X, check_input=True): Returns ------- - leaf_nodes_samples : a list of array-like + leaf_nodes_samples : a list of array-like of length (n_samples,) Each sample is represented by the indices of the training samples that reached the leaf node. 
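For orientation on the API touched by PATCHES 36-38 above, a rough usage sketch assuming an estimator built from this fork branch. ``store_leaf_values``, ``get_leaf_node_samples`` and ``predict_quantiles`` are the fork-specific names appearing in these diffs, not upstream scikit-learn API, and their availability on the forest classes is an assumption about this branch:

```python
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=4, random_state=0)

# store_leaf_values=True keeps the training y values reaching each leaf.
est = RandomForestRegressor(n_estimators=10, store_leaf_values=True, random_state=0)
est.fit(X, y)

# One array per query sample: the training samples that reached its leaves.
leaf_samples = est.get_leaf_node_samples(X[:3])

# Per-sample quantiles over the stored leaf values (the PATCH 32/38 code path).
y_median = est.predict_quantiles(X[:3], quantiles=0.5, method="nearest")
```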
The ``n_leaf_node_samples`` may vary between samples, since the number of samples that fall in a leaf node is From 6ec023b366065100b8e61261f139cb57f587bbec Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 11 Aug 2023 10:33:48 -0400 Subject: [PATCH 39/39] [MERGE] Merge changes from sklearn main (#52) Merging latest changes from sklearn main #### What does this implement/fix? Explain your changes. #### Any other comments? --------- Signed-off-by: Adam Li --- .cirrus.star | 12 +- .github/workflows/wheels.yml | 12 + .gitignore | 3 + azure-pipelines.yml | 1 - build_tools/azure/posix-docker.yml | 1 - build_tools/azure/posix.yml | 1 - build_tools/azure/test_script.sh | 6 +- build_tools/cirrus/arm_tests.yml | 8 +- build_tools/cirrus/arm_wheel.yml | 12 +- build_tools/cirrus/build_test_arm.sh | 2 +- .../update_environments_and_lock_files.py | 6 +- doc/developers/contributing.rst | 1 + doc/glossary.rst | 23 + doc/install.rst | 12 +- doc/modules/array_api.rst | 19 +- doc/modules/compose.rst | 6 +- doc/modules/cross_validation.rst | 1 + doc/modules/ensemble.rst | 6 +- doc/modules/linear_model.rst | 12 +- doc/modules/neighbors.rst | 8 +- doc/modules/preprocessing.rst | 16 +- doc/modules/svm.rst | 34 +- doc/modules/tree.rst | 4 +- doc/related_projects.rst | 95 ++-- .../scikit-learn-modern/static/css/theme.css | 8 +- .../machine_learning_map/pyparsing.py | 2 +- doc/whats_new/v0.22.rst | 2 +- doc/whats_new/v1.3.rst | 36 ++ doc/whats_new/v1.4.rst | 96 ++++ .../plot_classifier_comparison.py | 16 +- ...ot_forest_hist_grad_boosting_comparison.py | 4 +- .../plot_select_from_model_diabetes.py | 54 +- .../miscellaneous/plot_metadata_routing.py | 8 +- examples/neighbors/plot_classification.py | 107 ++-- .../plot_target_encoder_cross_val.py | 137 +++-- .../plot_release_highlights_0_23_0.py | 2 +- .../plot_release_highlights_0_24_0.py | 2 +- .../plot_release_highlights_1_0_0.py | 2 +- .../plot_release_highlights_1_1_0.py | 2 +- .../plot_release_highlights_1_2_0.py | 2 +- .../plot_release_highlights_1_3_0.py | 2 +- pyproject.toml | 2 +- setup.cfg | 4 +- setup.py | 18 +- sklearn/_loss/loss.py | 12 +- sklearn/calibration.py | 14 +- sklearn/cluster/_dbscan.py | 12 +- sklearn/cluster/_hdbscan/hdbscan.py | 60 ++- sklearn/cluster/tests/test_hdbscan.py | 69 ++- sklearn/cross_decomposition/_pls.py | 7 +- sklearn/cross_decomposition/tests/test_pls.py | 23 + sklearn/ensemble/_forest.py | 57 ++- sklearn/ensemble/_gb.py | 48 +- .../gradient_boosting.py | 16 +- sklearn/ensemble/_stacking.py | 2 +- sklearn/ensemble/tests/test_forest.py | 91 ++++ sklearn/feature_selection/_rfe.py | 7 +- sklearn/impute/_knn.py | 7 +- sklearn/impute/tests/test_common.py | 36 ++ sklearn/linear_model/_logistic.py | 22 +- sklearn/metrics/_dist_metrics.pxd.tp | 40 +- sklearn/metrics/_dist_metrics.pyx.tp | 481 +++++++++++------- .../_argkmin.pyx.tp | 2 +- .../_argkmin_classmode.pyx.tp | 34 +- .../_classmode.pxd | 5 + .../_datasets_pair.pxd.tp | 14 +- .../_datasets_pair.pyx.tp | 35 +- .../_dispatcher.py | 34 +- sklearn/metrics/_scorer.py | 9 +- sklearn/metrics/tests/test_classification.py | 8 +- sklearn/metrics/tests/test_dist_metrics.py | 32 +- .../test_pairwise_distances_reduction.py | 48 +- sklearn/metrics/tests/test_score_objects.py | 20 +- sklearn/mixture/_gaussian_mixture.py | 95 +++- .../mixture/tests/test_gaussian_mixture.py | 85 ++++ sklearn/model_selection/_search.py | 2 + .../_search_successive_halving.py | 8 +- sklearn/model_selection/_split.py | 2 +- sklearn/model_selection/_validation.py | 303 +++++++++-- 
sklearn/model_selection/tests/test_search.py | 39 +- sklearn/model_selection/tests/test_split.py | 80 ++- .../tests/test_successive_halving.py | 73 ++- .../model_selection/tests/test_validation.py | 230 ++++++++- sklearn/multioutput.py | 16 +- sklearn/neighbors/_ball_tree.pyx | 195 ------- sklearn/neighbors/_ball_tree.pyx.tp | 284 +++++++++++ sklearn/neighbors/_base.py | 20 +- .../{_binary_tree.pxi => _binary_tree.pxi.tp} | 382 ++++++++------ sklearn/neighbors/_classification.py | 4 +- .../{_kd_tree.pyx => _kd_tree.pyx.tp} | 173 +++++-- sklearn/neighbors/_partition_nodes.pxd | 3 +- sklearn/neighbors/_partition_nodes.pyx | 4 +- sklearn/neighbors/_regression.py | 7 +- sklearn/neighbors/tests/test_ball_tree.py | 131 ++++- sklearn/neighbors/tests/test_kd_tree.py | 80 ++- sklearn/neighbors/tests/test_neighbors.py | 114 ++++- .../neighbors/tests/test_neighbors_tree.py | 4 +- sklearn/pipeline.py | 22 +- sklearn/preprocessing/_encoders.py | 25 +- sklearn/preprocessing/_label.py | 18 +- sklearn/preprocessing/_target_encoder.py | 34 +- sklearn/preprocessing/tests/test_encoders.py | 20 + .../tests/test_function_transformer.py | 5 +- sklearn/preprocessing/tests/test_label.py | 14 + sklearn/svm/_base.py | 26 +- sklearn/svm/_classes.py | 74 ++- sklearn/tests/metadata_routing_common.py | 407 +++++++++++++++ sklearn/tests/test_metadata_routing.py | 383 ++++---------- .../test_metaestimators_metadata_routing.py | 200 +------- sklearn/tests/test_pipeline.py | 29 +- sklearn/tree/_classes.py | 8 +- sklearn/tree/_export.py | 4 +- sklearn/utils/__init__.py | 2 +- sklearn/utils/_array_api.py | 3 + sklearn/utils/_encode.py | 2 +- sklearn/utils/_estimator_html_repr.py | 66 ++- sklearn/utils/_metadata_requests.py | 70 ++- sklearn/utils/_set_output.py | 4 +- sklearn/utils/_testing.py | 28 +- sklearn/utils/estimator_checks.py | 13 +- sklearn/utils/fixes.py | 8 + sklearn/utils/multiclass.py | 9 +- .../utils/tests/test_estimator_html_repr.py | 3 + sklearn/utils/tests/test_pprint.py | 2 +- sklearn/utils/tests/test_set_output.py | 29 ++ 125 files changed, 4085 insertions(+), 1809 deletions(-) create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd delete mode 100644 sklearn/neighbors/_ball_tree.pyx create mode 100644 sklearn/neighbors/_ball_tree.pyx.tp rename sklearn/neighbors/{_binary_tree.pxi => _binary_tree.pxi.tp} (90%) rename sklearn/neighbors/{_kd_tree.pyx => _kd_tree.pyx.tp} (65%) create mode 100644 sklearn/tests/metadata_routing_common.py diff --git a/.cirrus.star b/.cirrus.star index 2dd1e50144987..7a432556c1299 100644 --- a/.cirrus.star +++ b/.cirrus.star @@ -14,7 +14,7 @@ def main(ctx): # Nightly jobs always run if env.get("CIRRUS_CRON", "") == "nightly": - return fs.read(arm_wheel_yaml) + return fs.read(arm_wheel_yaml) + fs.read(arm_tests_yaml) # Get commit message for event. We can not use `git` here because there is # no command line access in starlark. 
Thus we need to query the GitHub API @@ -26,10 +26,12 @@ def main(ctx): response = http.get(url).json() commit_msg = response["message"] - if "[skip ci]" in commit_msg: - return [] + jobs_to_run = "" if "[cd build]" in commit_msg or "[cd build cirrus]" in commit_msg: - return fs.read(arm_wheel_yaml) + fs.read(arm_tests_yaml) + jobs_to_run += fs.read(arm_wheel_yaml) + + if "[cirrus arm]" in commit_msg: + jobs_to_run += fs.read(arm_tests_yaml) - return fs.read(arm_tests_yaml) + return jobs_to_run diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 4ab75fd361586..4300db6c5e208 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -103,6 +103,18 @@ jobs: python: 311 platform_id: macosx_x86_64 + # MacOS arm64 + # The latest Python version is built and tested on CirrusCI + - os: macos-latest + python: 38 + platform_id: macosx_arm64 + - os: macos-latest + python: 39 + platform_id: macosx_arm64 + - os: macos-latest + python: 310 + platform_id: macosx_arm64 + steps: - name: Checkout scikit-learn uses: actions/checkout@v3 diff --git a/.gitignore b/.gitignore index 5296f46280e4d..cfc13d4997b4b 100644 --- a/.gitignore +++ b/.gitignore @@ -100,6 +100,9 @@ sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx +sklearn/neighbors/_ball_tree.pyx +sklearn/neighbors/_binary_tree.pxi +sklearn/neighbors/_kd_tree.pyx # Default JupyterLite content jupyterlite_contents diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6c3511319e4eb..464096fb69c29 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -171,7 +171,6 @@ jobs: DISTRIB: 'conda' LOCK_FILE: './build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock' COVERAGE: 'true' - SHOW_SHORT_SUMMARY: 'true' SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '42' # default global random seed # Check compilation with Ubuntu 22.04 LTS (Jammy Jellyfish) and scipy from conda-forge diff --git a/build_tools/azure/posix-docker.yml b/build_tools/azure/posix-docker.yml index af776c4c62f14..b00ca66c378ca 100644 --- a/build_tools/azure/posix-docker.yml +++ b/build_tools/azure/posix-docker.yml @@ -22,7 +22,6 @@ jobs: # Set in azure-pipelines.yml DISTRIB: '' DOCKER_CONTAINER: '' - SHOW_SHORT_SUMMARY: 'false' CREATE_ISSUE_ON_TRACKER: 'true' CCACHE_DIR: $(Pipeline.Workspace)/ccache CCACHE_COMPRESS: '1' diff --git a/build_tools/azure/posix.yml b/build_tools/azure/posix.yml index 2ee03daafd288..35e5165d22c83 100644 --- a/build_tools/azure/posix.yml +++ b/build_tools/azure/posix.yml @@ -22,7 +22,6 @@ jobs: PYTEST_XDIST_VERSION: 'latest' COVERAGE: 'true' CREATE_ISSUE_ON_TRACKER: 'true' - SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 98ac2e797b73c..5117473ea6366 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -49,7 +49,7 @@ if [[ "$COVERAGE" == "true" ]]; then fi if [[ -n "$CHECK_WARNINGS" ]]; then - TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Werror::numpy.VisibleDeprecationWarning" + TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Werror::sklearn.utils.fixes.VisibleDeprecationWarning" # numpy's 1.19.0's tostring() deprecation is ignored until scipy and joblib # removes its usage @@ -75,10 
+75,6 @@ if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then TEST_CMD="$TEST_CMD -n$XDIST_WORKERS" fi -if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then - TEST_CMD="$TEST_CMD -ra" -fi - if [[ -n "$SELECTED_TESTS" ]]; then TEST_CMD="$TEST_CMD -k $SELECTED_TESTS" diff --git a/build_tools/cirrus/arm_tests.yml b/build_tools/cirrus/arm_tests.yml index a6e5919ecc32f..d1ac551a749e3 100644 --- a/build_tools/cirrus/arm_tests.yml +++ b/build_tools/cirrus/arm_tests.yml @@ -17,4 +17,10 @@ linux_aarch64_test_task: folder: /root/.conda/pkgs fingerprint_script: cat build_tools/cirrus/py39_conda_forge_linux-aarch64_conda.lock - test_script: bash build_tools/cirrus/build_test_arm.sh + test_script: | + bash build_tools/cirrus/build_test_arm.sh + # On success, this script is run updating the issue. + bash build_tools/cirrus/update_tracking_issue.sh true + + on_failure: + update_tracker_script: bash build_tools/cirrus/update_tracking_issue.sh false diff --git a/build_tools/cirrus/arm_wheel.yml b/build_tools/cirrus/arm_wheel.yml index a7023867e1109..5616108315fba 100644 --- a/build_tools/cirrus/arm_wheel.yml +++ b/build_tools/cirrus/arm_wheel.yml @@ -16,12 +16,8 @@ macos_arm64_wheel_task: # See `maint_tools/update_tracking_issue.py` for details on the permissions the token requires. BOT_GITHUB_TOKEN: ENCRYPTED[9b50205e2693f9e4ce9a3f0fcb897a259289062fda2f5a3b8aaa6c56d839e0854a15872f894a70fca337dd4787274e0f] matrix: - - env: - CIBW_BUILD: cp38-macosx_arm64 - - env: - CIBW_BUILD: cp39-macosx_arm64 - - env: - CIBW_BUILD: cp310-macosx_arm64 + # Only the latest Python version is built and tested on CirrusCI, the other + # macos arm64 builds are on GitHub Actions - env: CIBW_BUILD: cp311-macosx_arm64 @@ -60,12 +56,16 @@ linux_arm64_wheel_task: # See `maint_tools/update_tracking_issue.py` for details on the permissions the token requires. BOT_GITHUB_TOKEN: ENCRYPTED[9b50205e2693f9e4ce9a3f0fcb897a259289062fda2f5a3b8aaa6c56d839e0854a15872f894a70fca337dd4787274e0f] matrix: + # Only the latest Python version is tested - env: CIBW_BUILD: cp38-manylinux_aarch64 + CIBW_TEST_SKIP: "*_aarch64" - env: CIBW_BUILD: cp39-manylinux_aarch64 + CIBW_TEST_SKIP: "*_aarch64" - env: CIBW_BUILD: cp310-manylinux_aarch64 + CIBW_TEST_SKIP: "*_aarch64" - env: CIBW_BUILD: cp311-manylinux_aarch64 diff --git a/build_tools/cirrus/build_test_arm.sh b/build_tools/cirrus/build_test_arm.sh index 4eeef6ec2dc0c..dfe048da47a7f 100755 --- a/build_tools/cirrus/build_test_arm.sh +++ b/build_tools/cirrus/build_test_arm.sh @@ -25,7 +25,7 @@ setup_ccache() { MAMBAFORGE_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-aarch64.sh" # Install Mambaforge -wget $MAMBAFORGE_URL -O mambaforge.sh +curl -L $MAMBAFORGE_URL -o mambaforge.sh MAMBAFORGE_PATH=$HOME/mambaforge bash ./mambaforge.sh -b -p $MAMBAFORGE_PATH export PATH=$MAMBAFORGE_PATH/bin:$PATH diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py index 4854cc7936aca..35c382bd7f5ab 100644 --- a/build_tools/update_environments_and_lock_files.py +++ b/build_tools/update_environments_and_lock_files.py @@ -556,15 +556,15 @@ def check_conda_version(): # Avoid issues with glibc (https://github.com/conda/conda-lock/issues/292) # or osx (https://github.com/conda/conda-lock/issues/408) virtual package. # The glibc one has been fixed in conda 23.1.0 and the osx has been fixed - # in main and will be fixed when conda >= 23.6 is released. + # in conda 23.7.0. 
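The ``check_conda_version`` hunk above only widens a rejected version range; a self-contained sketch of the same guard, assuming ``packaging`` is used for the comparison as in typical build tooling:

```python
from packaging.version import Version


def check_conda_version(conda_version_str):
    # conda-lock is affected by virtual-package issues for conda versions
    # strictly between 22.9.0 and 23.7, so that range is rejected outright.
    conda_version = Version(conda_version_str)
    if Version("22.9.0") < conda_version < Version("23.7"):
        raise RuntimeError(
            f"conda version should be <= 22.9.0 or >= 23.7 got: {conda_version}"
        )


check_conda_version("23.7.2")  # accepted: above the broken range
check_conda_version("22.9.0")  # accepted: not strictly greater than 22.9.0
```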
conda_info_output = execute_command(["conda", "info", "--json"]) conda_info = json.loads(conda_info_output) conda_version = Version(conda_info["conda_version"]) - if Version("22.9.0") < conda_version < Version("23.6"): + if Version("22.9.0") < conda_version < Version("23.7"): raise RuntimeError( - f"conda version should be <= 22.9.0 or >= 23.6 got: {conda_version}" + f"conda version should be <= 22.9.0 or >= 23.7 got: {conda_version}" ) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index fc1ef95dbced0..6aecc524a9a30 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -542,6 +542,7 @@ message, the following actions are taken. [pypy] Build & test with PyPy [pyodide] Build & test with Pyodide [azure parallel] Run Azure CI jobs in parallel + [cirrus arm] Run Cirrus CI ARM test [float32] Run float32 tests by setting `SKLEARN_RUN_FLOAT32_TESTS=1`. See :ref:`environment_variable` for more details [doc skip] Docs are not built [doc quick] Docs built, but excludes example gallery plots diff --git a/doc/glossary.rst b/doc/glossary.rst index 36afcd9483684..1dbb7e630c449 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -205,6 +205,29 @@ General Concepts exceptional behaviours on the estimator using semantic :term:`estimator tags`. + cross-fitting + cross fitting + A resampling method that iteratively partitions data into mutually + exclusive subsets to fit two stages. During the first stage, the + mutually exclusive subsets enable predictions or transformations to be + computed on data not seen during training. The computed data is then + used in the second stage. The objective is to avoid having any + overfitting in the first stage introduce bias into the input data + distribution of the second stage. + For examples of its use, see: :class:`~preprocessing.TargetEncoder`, + :class:`~ensemble.StackingClassifier`, + :class:`~ensemble.StackingRegressor` and + :class:`~calibration.CalibratedClassifierCV`. + + cross-validation + cross validation + A resampling method that iteratively partitions data into mutually + exclusive 'train' and 'test' subsets so model performance can be + evaluated on unseen data. This conserves data as avoids the need to hold + out a 'validation' dataset and accounts for variability as multiple + rounds of cross validation are generally performed. + See :ref:`User Guide ` for more details. + deprecation We use deprecation to slowly violate our :term:`backwards compatibility` assurances, usually to: diff --git a/doc/install.rst b/doc/install.rst index bf2832bf72f24..263e83cdc31a5 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -61,7 +61,7 @@ Installing the latest release >Install python3 and python3-pip using the package manager of the Linux Distribution.Install conda using the Anaconda or miniconda - installers or the miniforge installers + installers or the miniforge installers (no administrator permission required for any of those). @@ -279,14 +279,14 @@ and in the `main`, `conda-forge` and `intel` conda channels: conda install scikit-learn-intelex -This package has an Intel optimized version of many estimators. Whenever -an alternative implementation doesn't exist, scikit-learn implementation -is used as a fallback. Those optimized solvers come from the oneDAL -C++ library and are optimized for the x86_64 architecture, and are +This package has an Intel optimized version of many estimators. Whenever +an alternative implementation doesn't exist, scikit-learn implementation +is used as a fallback. 
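The new ``cross-fitting`` glossary entry in the ``doc/glossary.rst`` hunk above can be made concrete with a tiny two-stage sketch (not part of the diff): first-stage predictions are produced out-of-fold with ``cross_val_predict`` and only those are fed to the second stage, so no row is scored by a first-stage model that saw it during fit:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = make_classification(n_samples=300, random_state=0)

# Stage 1: each row is predicted by a model fitted on the other folds.
oof_proba = cross_val_predict(
    RandomForestClassifier(n_estimators=50, random_state=0),
    X,
    y,
    cv=5,
    method="predict_proba",
)

# Stage 2: consumes the cross-fitted predictions, limiting leakage/overfitting.
second_stage = LogisticRegression().fit(oof_proba, y)
```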
Those optimized solvers come from the oneDAL +C++ library and are optimized for the x86_64 architecture, and are optimized for multi-core Intel CPUs. Note that those solvers are not enabled by default, please refer to the -`scikit-learn-intelex `_ +`scikit-learn-intelex `_ documentation for more details on usage scenarios. Direct export example: .. prompt:: bash $ diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst index 635395fd07c43..741ebbf240a6d 100644 --- a/doc/modules/array_api.rst +++ b/doc/modules/array_api.rst @@ -83,17 +83,26 @@ the tensors directly:: >>> X_trans.device.type 'cuda' -.. _array_api_estimators: +.. _array_api_supported: -Estimators with support for `Array API`-compatible inputs -========================================================= +Support for `Array API`-compatible inputs +========================================= + +Estimators and other tools in scikit-learn that support Array API compatible inputs. + +Estimators +---------- - :class:`decomposition.PCA` (with `svd_solver="full"`, `svd_solver="randomized"` and `power_iteration_normalizer="QR"`) - :class:`discriminant_analysis.LinearDiscriminantAnalysis` (with `solver="svd"`) -Coverage for more estimators is expected to grow over time. Please follow the -dedicated `meta-issue on GitHub +Tools +----- + +- :func:`model_selection.train_test_split` + +Coverage is expected to grow over time. Please follow the dedicated `meta-issue on GitHub `_ to track progress. Common estimator checks diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index faba9a76ab94c..f277c32675c3f 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -66,10 +66,8 @@ it takes a variable number of estimators and returns a pipeline, filling in the names automatically:: >>> from sklearn.pipeline import make_pipeline - >>> from sklearn.naive_bayes import MultinomialNB - >>> from sklearn.preprocessing import Binarizer - >>> make_pipeline(Binarizer(), MultinomialNB()) - Pipeline(steps=[('binarizer', Binarizer()), ('multinomialnb', MultinomialNB())]) + >>> make_pipeline(PCA(), SVC()) + Pipeline(steps=[('pca', PCA()), ('svc', SVC())]) Accessing steps ............... diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index 6158e000cb727..8afa467982736 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -102,6 +102,7 @@ where the number of samples is very small. .. image:: ../images/grid_search_cross_validation.png :width: 500px :height: 300px + :alt: A depiction of a 5 fold cross validation on a training set, while holding out a test set. :align: center Computing cross-validated metrics diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index c3ea63bc6e944..36eed98da0f6b 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -10,12 +10,12 @@ Ensembles: Gradient boosting, random forests, bagging, voting, stacking base estimators built with a given learning algorithm in order to improve generalizability / robustness over a single estimator. -Two very famous examples of ensemble methods are `gradient-boosted trees -`_ and `random forests `_. +Two very famous examples of ensemble methods are :ref:`gradient-boosted trees +` and :ref:`random forests `. More generally, ensemble models can be applied to any base learner beyond trees, in averaging methods such as :ref:`Bagging methods `, -`model stacking `_, or `Voting `_, or in +:ref:`model stacking `, or :ref:`Voting `, or in boosting, as :ref:`AdaBoost `. .. 
contents:: diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 43356763d69c3..aa9184a2bedc5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -37,7 +37,7 @@ solves a problem of the form: :align: center :scale: 50% -:class:`LinearRegression` will take in its ``fit`` method arrays X, y +:class:`LinearRegression` will take in its ``fit`` method arrays ``X``, ``y`` and will store the coefficients :math:`w` of the linear model in its ``coef_`` member:: @@ -114,7 +114,7 @@ of shrinkage and thus the coefficients become more robust to collinearity. As with other linear models, :class:`Ridge` will take in its ``fit`` method -arrays X, y and will store the coefficients :math:`w` of the linear model in +arrays ``X``, ``y`` and will store the coefficients :math:`w` of the linear model in its ``coef_`` member:: >>> from sklearn import linear_model @@ -889,12 +889,16 @@ the probability of the positive class :math:`P(y_i=1|X_i)` as .. math:: \hat{p}(X_i) = \operatorname{expit}(X_i w + w_0) = \frac{1}{1 + \exp(-X_i w - w_0)}. + As an optimization problem, binary class logistic regression with regularization term :math:`r(w)` minimizes the following cost function: -.. math:: \min_{w} C \sum_{i=1}^n \left(-y_i \log(\hat{p}(X_i)) - (1 - y_i) \log(1 - \hat{p}(X_i))\right) + r(w). - +.. math:: + :name: regularized-logistic-loss + + \min_{w} C \sum_{i=1}^n \left(-y_i \log(\hat{p}(X_i)) - (1 - y_i) \log(1 - \hat{p}(X_i))\right) + r(w). + We currently provide four choices for the regularization term :math:`r(w)` via the `penalty` argument: diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index d11287e7c29b1..d3a7df74e6348 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -188,13 +188,9 @@ distance can be supplied to compute the weights. .. |classification_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_classification_001.png :target: ../auto_examples/neighbors/plot_classification.html - :scale: 50 - -.. |classification_2| image:: ../auto_examples/neighbors/images/sphx_glr_plot_classification_002.png - :target: ../auto_examples/neighbors/plot_classification.html - :scale: 50 + :scale: 75 -.. centered:: |classification_1| |classification_2| +.. centered:: |classification_1| .. topic:: Examples: diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 1d7ad07f7023c..82fecf0c4e9f1 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -910,16 +910,16 @@ For continuous targets, the formulation is similar to binary classification: where :math:`L_i` is the set of observations with category :math:`i` and :math:`n_i` is the number of observations with category :math:`i`. -:meth:`~TargetEncoder.fit_transform` internally relies on a cross fitting +:meth:`~TargetEncoder.fit_transform` internally relies on a :term:`cross fitting` scheme to prevent target information from leaking into the train-time representation, especially for non-informative high-cardinality categorical variables, and help prevent the downstream model from overfitting spurious correlations. Note that as a result, `fit(X, y).transform(X)` does not equal `fit_transform(X, y)`. In :meth:`~TargetEncoder.fit_transform`, the training -data is split into *k* folds (determined by the `cv` parameter) and encodes each -fold using the encodings trained on the other *k-1* folds. 
The following diagram -shows the cross fitting scheme in :meth:`~TargetEncoder.fit_transform` with -the default `cv=5`: +data is split into *k* folds (determined by the `cv` parameter) and each fold is +encoded using the encodings learnt using the other *k-1* folds. The following +diagram shows the :term:`cross fitting` scheme in +:meth:`~TargetEncoder.fit_transform` with the default `cv=5`: .. image:: ../images/target_encoder_cross_validation.svg :width: 600 @@ -929,10 +929,10 @@ the default `cv=5`: the whole training set. This is never used in :meth:`~TargetEncoder.fit_transform` but is saved to the attribute `encodings_`, for use when :meth:`~TargetEncoder.transform` is called. Note that the encodings -learned for each fold during the cross fitting scheme are not saved to an -attribute. +learned for each fold during the :term:`cross fitting` scheme are not saved to +an attribute. -The :meth:`~TargetEncoder.fit` method does **not** use any cross fitting +The :meth:`~TargetEncoder.fit` method does **not** use any :term:`cross fitting` schemes and learns one encoding on the entire training set, which is used to encode categories in :meth:`~TargetEncoder.transform`. This encoding is the same as the 'full data' diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 7e886366aebae..0ac34cdcb6a10 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -60,14 +60,19 @@ capable of performing binary and multi-class classification on a dataset. :align: center -:class:`SVC` and :class:`NuSVC` are similar methods, but accept -slightly different sets of parameters and have different mathematical -formulations (see section :ref:`svm_mathematical_formulation`). On the -other hand, :class:`LinearSVC` is another (faster) implementation of Support -Vector Classification for the case of a linear kernel. Note that -:class:`LinearSVC` does not accept parameter ``kernel``, as this is -assumed to be linear. It also lacks some of the attributes of -:class:`SVC` and :class:`NuSVC`, like ``support_``. +:class:`SVC` and :class:`NuSVC` are similar methods, but accept slightly +different sets of parameters and have different mathematical formulations (see +section :ref:`svm_mathematical_formulation`). On the other hand, +:class:`LinearSVC` is another (faster) implementation of Support Vector +Classification for the case of a linear kernel. It also +lacks some of the attributes of :class:`SVC` and :class:`NuSVC`, like +`support_`. :class:`LinearSVC` uses `squared_hinge` loss and due to its +implementation in `liblinear` it also regularizes the intercept, if considered. +This effect can however be reduced by carefully fine tuning its +`intercept_scaling` parameter, which allows the intercept term to have a +different regularization behavior compared to the other features. The +classification results and score can therefore differ from the other two +classifiers. As other classifiers, :class:`SVC`, :class:`NuSVC` and :class:`LinearSVC` take as input two arrays: an array `X` of shape @@ -314,10 +319,15 @@ target. There are three different implementations of Support Vector Regression: :class:`SVR`, :class:`NuSVR` and :class:`LinearSVR`. :class:`LinearSVR` -provides a faster implementation than :class:`SVR` but only considers -the linear kernel, while :class:`NuSVR` implements a slightly different -formulation than :class:`SVR` and :class:`LinearSVR`. See -:ref:`svm_implementation_details` for further details. 
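The ``svm.rst`` wording above about `liblinear` regularizing the intercept can be observed directly by varying ``intercept_scaling``; a small sketch (not part of the diff, and the size of the effect depends on the data — this only shows the knob being described):

```python
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=5, random_state=0)

# liblinear penalizes the intercept together with the weights; a larger
# intercept_scaling effectively reduces the regularization on the intercept.
for scaling in (1.0, 10.0, 100.0):
    clf = LinearSVC(C=0.01, intercept_scaling=scaling, random_state=0).fit(X, y)
    print(scaling, clf.intercept_)
```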
+provides a faster implementation than :class:`SVR` but only considers the +linear kernel, while :class:`NuSVR` implements a slightly different formulation +than :class:`SVR` and :class:`LinearSVR`. Due to its implementation in +`liblinear` :class:`LinearSVR` also regularizes the intercept, if considered. +This effect can however be reduced by carefully fine tuning its +`intercept_scaling` parameter, which allows the intercept term to have a +different regularization behavior compared to the other features. The +classification results and score can therefore differ from the other two +classifiers. See :ref:`svm_implementation_details` for further details. As with classification classes, the fit method will take as argument vectors X, y, only that in this case y is expected to have diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index ae82af1366966..7ae039e64a49a 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -27,8 +27,8 @@ Some advantages of decision trees are: - Requires little data preparation. Other techniques often require data normalization, dummy variables need to be created and blank values to - be removed. Note however that this module does not support missing - values. + be removed. Some tree and algorithm combinations support + :ref:`missing values `. - The cost of using the tree (i.e., predicting data) is logarithmic in the number of data points used to train the tree. diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 9cc70ad89ffff..10304a7070be0 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -21,9 +21,6 @@ enhance the functionality of scikit-learn's estimators. **Data formats** -- `Fast svmlight / libsvm file loader `_ - Fast and memory-efficient svmlight / libsvm file loader for Python. - - `sklearn_pandas `_ bridge for scikit-learn pipelines and pandas data frame with dedicated transformers. @@ -64,19 +61,20 @@ enhance the functionality of scikit-learn's estimators. It incorporates multiple modeling libraries under one API, and the objects that EvalML creates use an sklearn-compatible API. -**Experimentation frameworks** +**Experimentation and model registry frameworks** + +- `MLFlow `_ MLflow is an open source platform to manage the ML + lifecycle, including experimentation, reproducibility, deployment, and a central + model registry. - `Neptune `_ Metadata store for MLOps, - built for teams that run a lot of experiments.‌ It gives you a single + built for teams that run a lot of experiments. It gives you a single place to log, store, display, organize, compare, and query all your model building metadata. - `Sacred `_ Tool to help you configure, organize, log and reproduce experiments -- `REP `_ Environment for conducting data-driven - research in a consistent and reproducible way - - `Scikit-Learn Laboratory `_ A command-line wrapper around scikit-learn that makes it easy to run machine learning @@ -91,10 +89,7 @@ enhance the functionality of scikit-learn's estimators. debugging/inspecting machine learning models and explaining their predictions. -- `mlxtend `_ Includes model visualization - utilities. - -- `sklearn-evaluation `_ +- `sklearn-evaluation `_ Machine learning model evaluation made easy: plots, tables, HTML reports, experiment tracking and Jupyter notebook analysis. Visual analysis, model selection, evaluation and diagnostics. @@ -140,7 +135,15 @@ enhance the functionality of scikit-learn's estimators. 
- `treelite `_ Compiles tree-based ensemble models into C code for minimizing prediction latency. - + +- `micromlgen `_ + MicroML brings Machine Learning algorithms to microcontrollers. + Supports several scikit-learn classifiers by transpiling them to C code. + +- `emlearn `_ + Implements scikit-learn estimators in C99 for embedded devices and microcontrollers. + Supports several classifier, regression and outlier detection models. + **Model throughput** - `Intel(R) Extension for scikit-learn `_ @@ -161,12 +164,40 @@ project. The following are projects providing interfaces similar to scikit-learn for additional learning algorithms, infrastructures and tasks. -**Structured learning** +**Time series and forecasting** + +- `Darts `_ Darts is a Python library for + user-friendly forecasting and anomaly detection on time series. It contains a variety + of models, from classics such as ARIMA to deep neural networks. The forecasting + models can all be used in the same way, using fit() and predict() functions, similar + to scikit-learn. + +- `sktime `_ A scikit-learn compatible + toolbox for machine learning with time series including time series + classification/regression and (supervised/panel) forecasting. + +- `skforecast `_ A python library + that eases using scikit-learn regressors as multi-step forecasters. It also works + with any regressor compatible with the scikit-learn API. + +- `tslearn `_ A machine learning library for + time series that offers tools for pre-processing and feature extraction as well as + dedicated models for clustering, classification and regression. -- `tslearn `_ A machine learning library for time series - that offers tools for pre-processing and feature extraction as well as dedicated models for clustering, classification and regression. +**Gradient (tree) boosting** -- `sktime `_ A scikit-learn compatible toolbox for machine learning with time series including time series classification/regression and (supervised/panel) forecasting. +Note scikit-learn own modern gradient boosting estimators +:class:`~sklearn.ensemble.HistGradientBoostingClassifier` and +:class:`~sklearn.ensemble.HistGradientBoostingRegressor`. + +- `XGBoost `_ XGBoost is an optimized distributed + gradient boosting library designed to be highly efficient, flexible and portable. + +- `LightGBM `_ LightGBM is a gradient boosting + framework that uses tree based learning algorithms. It is designed to be distributed + and efficient. + +**Structured learning** - `HMMLearn `_ Implementation of hidden markov models that was previously part of scikit-learn. @@ -182,21 +213,9 @@ and tasks. (`CRFsuite `_ wrapper with sklearn-like API). -- `skforecast `_ A python library - that eases using scikit-learn regressors as multi-step forecasters. It also works - with any regressor compatible with the scikit-learn API. **Deep neural networks etc.** -- `nolearn `_ A number of wrappers and - abstractions around existing neural network libraries - -- `Keras `_ High-level API for - TensorFlow with a scikit-learn inspired API. - -- `lasagne `_ A lightweight library to - build and train neural networks in Theano. - - `skorch `_ A scikit-learn compatible neural network library that wraps PyTorch. @@ -219,9 +238,6 @@ and tasks. **Other regression and classification** -- `xgboost `_ Optimised gradient boosted decision - tree library. - - `ML-Ensemble `_ Generalized ensemble learning (stacking, blending, subsemble, deep ensembles, etc.). @@ -232,10 +248,6 @@ and tasks. 
- `py-earth `_ Multivariate adaptive regression splines -- `Kernel Regression `_ - Implementation of Nadaraya-Watson kernel regression with automatic bandwidth - selection - - `gplearn `_ Genetic Programming for symbolic regression tasks. @@ -245,8 +257,6 @@ and tasks. - `seglearn `_ Time series and sequence learning using sliding window segmentation. -- `libOPF `_ Optimal path forest classifier - - `fastFM `_ Fast factorization machine implementation compatible with scikit-learn @@ -266,6 +276,7 @@ and tasks. - `hdbscan `_ HDBSCAN and Robust Single Linkage clustering algorithms for robust variable density clustering. + As of scikit-learn version 1.3.0, there is :class:`~sklearn.cluster.HDBSCAN`. - `spherecluster `_ Spherical K-means and mixture of von Mises Fisher clustering routines for data on the @@ -276,6 +287,8 @@ and tasks. - `categorical-encoding `_ A library of sklearn compatible categorical variable encoders. + As of scikit-learn version 1.3.0, there is + :class:`~sklearn.preprocessing.TargetEncoder`. - `imbalanced-learn `_ Various @@ -331,9 +344,6 @@ Recommendation Engine packages - `OpenRec `_ TensorFlow-based neural-network inspired recommendation algorithms. -- `Spotlight `_ Pytorch-based - implementation of deep recommender models. - - `Surprise Lib `_ Library for explicit feedback datasets. @@ -355,9 +365,6 @@ Domain specific packages - `AstroML `_ Machine learning for astronomy. -- `MSMBuilder `_ Machine learning for protein - conformational dynamics time series. - Translations of scikit-learn documentation ------------------------------------------ diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 0a8822cdcd848..40ac5e25ea698 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -661,13 +661,19 @@ div.sk-sidebar-global-toc ul ul { div.sk-page-content h1 { background-color: #cde8ef; padding: 0.5rem; - margin-top: calc(max(2.5rem, 1vh)); + margin-top: calc(max(1rem, 1vh)); border-radius: 0 1rem; text-align: center; font-size: 2rem; word-wrap: break-word; } +/* General sibling selector: does not apply to first h1, to avoid gap in + * top of page */ +div.sk-page-content ~ h1 { + margin-top: calc(max(2.5rem, 1vh)); +} + div.sk-page-content h2 { padding: 0.5rem; background-color: #BED4EB; diff --git a/doc/tutorial/machine_learning_map/pyparsing.py b/doc/tutorial/machine_learning_map/pyparsing.py index 0418cf2b51528..88d00e138d02c 100644 --- a/doc/tutorial/machine_learning_map/pyparsing.py +++ b/doc/tutorial/machine_learning_map/pyparsing.py @@ -21,7 +21,7 @@ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # -# flake8: noqa +# ruff: noqa __doc__ = \ """ diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index fea27b0c1c1a4..da2f5e8796db8 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -392,7 +392,7 @@ Changelog - |Efficiency| :class:`decomposition.NMF` with `solver="mu"` fitted on sparse input matrices now uses batching to avoid briefly allocating an array with size - (#non-zero elements, n_components). :pr:`15257` by `Mart Willocx `_. + (#non-zero elements, n_components). :pr:`15257` by :user:`Mart Willocx `. 
- |Enhancement| :func:`decomposition.dict_learning` and :func:`decomposition.dict_learning_online` now accept `method_max_iter` and diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 8d39ca2fed143..dc955f7aa0f51 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -9,9 +9,22 @@ Version 1.3.1 **In development** +Changes impacting all modules +----------------------------- + +- |Fix| The `set_output` API correctly works with list input. :pr:`27044` by + `Thomas Fan`_. + Changelog --------- +:mod:`sklearn.impute` +..................... + +- |Fix| :class:`impute.KNNImputer` now correctly adds a missing indicator column in + ``transform`` when ``add_indicator`` is set to ``True`` and missing values are observed + during ``fit``. :pr:`26600` by :user:`Shreesha Kumar Bhat `. + :mod:`sklearn.neighbors` ........................ @@ -23,6 +36,22 @@ Changelog :attr:`sklearn.neighbors.KDTree.valid_metrics` as public class attributes. :pr:`26754` by :user:`Julien Jerphanion `. +- |Fix| :class:`sklearn.model_selection.HalvingRandomSearchCV` no longer raises + when the input to the `param_distributions` parameter is a list of dicts. + :pr:`26893` by :user:`Stefanie Senger `. + +:mod:`sklearn.preprocessing` +............................ + +- |Fix| :class:`preprocessing.LabelEncoder` correctly accepts `y` as a keyword + argument. :pr:`26940` by `Thomas Fan`_. + +:mod:`sklearn.tree` +................... + +- |Fix| :func:`tree.plot_tree` now accepts `class_names=True` as documented. + :pr:`26903` by :user:`Thomas Roehr <2maz>` + .. _changes_1_3: Version 1.3.0 @@ -596,6 +625,13 @@ Changelog `n_targets`, which is used to decide the number of outputs when sampling from the prior distributions. :pr:`23099` by :user:`Zhehao Liu `. +:mod:`sklearn.mixture` +...................... + +- |Efficiency| :class:`GaussianMixture` is more efficient now and will bypass unnecessary + initialization if the weights, means, and precisions are given by users. + :pr:`26021` by :user:`Jiawei Zhang `. + :mod:`sklearn.model_selection` .............................. diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index c2b7d19404af9..e168f1d667607 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -19,6 +19,11 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. +- |Fix| The initialization of :class:`mixture.GaussianMixture` from user-provided + `precisions_init` for `covariance_type` of `full` or `tied` was not correct, + and has been fixed. + :pr:`26416` by :user:`Yang Tao `. + Changes impacting all modules ----------------------------- @@ -61,6 +66,27 @@ Changelog - |Enhancement| :func:`base.clone` now supports `dict` as input and creates a copy. :pr:`26786` by `Adrin Jalali`_. +- |API|:func:`~utils.metadata_routing.process_routing` now has a different + signature. The first two (the object and the method) are positional only, + and all metadata are passed as keyword arguments. :pr:`26909` by `Adrin + Jalali`_. + +:mod:`sklearn.cluster` +............................ + +- |API| : `kdtree` and `balltree` values are now deprecated and are renamed as + `kd_tree` and `ball_tree` respectively for the `algorithm` parameter of + :class:`cluster.HDBSCAN` ensuring consistency in naming convention. + `kdtree` and `balltree` values will be removed in 1.6. + :pr:`26744` by :user:`Shreesha Kumar Bhat `. 
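The ``v1.3.rst`` entry above for ``HalvingRandomSearchCV`` concerns passing a list of dicts as ``param_distributions``; a short sketch of that input shape, assuming a release that contains the referenced fix (the search class still requires the experimental import):

```python
from scipy.stats import randint

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingRandomSearchCV

X, y = make_classification(n_samples=400, random_state=0)

# A list of dicts: each dict is one sub-space of distributions to sample from.
param_distributions = [
    {"max_depth": [3, 5, None]},
    {"min_samples_split": randint(2, 11)},
]

search = HalvingRandomSearchCV(
    RandomForestClassifier(random_state=0),
    param_distributions,
    random_state=0,
).fit(X, y)
print(search.best_params_)
```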
+ +:mod:`sklearn.cross_decomposition` +.................................. + +- |Fix| :class:`cross_decomposition.PLSRegression` now automatically ravels the output + of `predict` if fitted with one dimensional `y`. + :pr:`26602` by :user:`Yao Xiao `. + :mod:`sklearn.decomposition` ............................ @@ -80,6 +106,12 @@ Changelog :mod:`sklearn.ensemble` ....................... +- |MajorFeature| :class:`ensemble.RandomForestClassifier` and + :class:`ensemble.RandomForestRegressor` support missing values when + the criterion is `gini`, `entropy`, or `log_loss`, + for classification or `squared_error`, `friedman_mse`, or `poisson` + for regression. :pr:`26391` by `Thomas Fan`_. + - |Feature| :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, :class:`ensemble.ExtraTreesClassifier` and :class:`ensemble.ExtraTreesRegressor` now support monotonic constraints, @@ -88,6 +120,11 @@ Changelog :pr:`13649` by :user:`Samuel Ronsin `, initiated by :user:`Patrick O'Reilly `. +- |Efficiency| Improves runtime and memory usage for + :class:`ensemble.GradientBoostingClassifier` and + :class:`ensemble.GradientBoostingRegressor` when trained on sparse data. + :pr:`26957` by `Thomas Fan`_. + :mod:`sklearn.feature_selection` ................................ @@ -120,6 +157,37 @@ Changelog object in the parameter grid if it's an estimator. :pr:`26786` by `Adrin Jalali`_. +- |Feature| :func:`~model_selection.cross_validate`, + :func:`~model_selection.cross_val_score`, and + :func:`~model_selection.cross_val_predict` now support metadata routing. The + metadata are routed to the estimator's `fit`, the scorer, and the CV + splitter's `split`. The metadata is accepted via the new `params` parameter. + `fit_params` is deprecated and will be removed in version 1.6. `groups` + parameter is also not accepted as a separate argument when metadata routing + is enabled and should be passed via the `params` parameter. :pr:`26896` by + `Adrin Jalali`_. + +:mod:`sklearn.neighbors` +........................ + +- |Fix| Neighbors based estimators now correctly work when `metric="minkowski"` and the + metric parameter `p` is in the range `0 < p < 1`, regardless of the `dtype` of `X`. + :pr:`26760` by :user:`Shreesha Kumar Bhat `. + +:mod:`sklearn.preprocessing` +............................ + +- |Efficiency| :class:`preprocessing.OrdinalEncoder` avoids calculating + missing indices twice to improve efficiency. + :pr:`27017` by :user:`Xuefeng Xu `. + +- |Fix| :class:`preprocessing.OneHotEncoder` shows a more informative error message + when `sparse_output=True` and the output is configured to be pandas. + :pr:`26931` by `Thomas Fan`_. + +- |Enhancement| :func:`sklearn.model_selection.train_test_split` now supports + Array API compatible inputs. :pr:`26855` by `Tim Head`_. + :mod:`sklearn.tree` ................... @@ -131,9 +199,37 @@ Changelog :pr:`13649` by :user:`Samuel Ronsin `, initiated by :user:`Patrick O'Reilly `. + +:mod:`sklearn.neighbors` +........................ + +- |API| :class:`neighbors.KNeighborsRegressor` now accepts + :class:`metric.DistanceMetric` objects directly via the `metric` keyword + argument allowing for the use of accelerated third-party + :class:`metric.DistanceMetric` objects. + :pr:`26267` by :user:`Meekail Zain ` + +:mod:`sklearn.metrics` +...................... + +- |Efficiency| Computing pairwise distances via :class:`metrics.DistanceMetric` + for CSR × CSR, Dense × CSR, and CSR × Dense datasets is now 1.5x faster. 
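The ``v1.4.rst`` entry above adds native missing-value support to the random forests; a minimal sketch of what that enables, assuming a scikit-learn build where that entry applies:

```python
import numpy as np

from sklearn.ensemble import RandomForestClassifier

X = np.array([[1.0], [2.0], [np.nan], [4.0], [np.nan], [6.0]] * 5)
y = np.array([0, 0, 0, 1, 1, 1] * 5)

# NaNs are handled natively for the supported criteria (default "gini" here),
# both at fit time and at predict time.
clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
print(clf.predict([[np.nan], [1.5], [5.0]]))
```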
+ :pr:`26765` by :user:`Meekail Zain ` + +- |Efficiency| Computing distances via :class:`metrics.DistanceMetric` + for CSR × CSR, Dense × CSR, and CSR × Dense now uses ~50% less memory, + and outputs distances in the same dtype as the provided data. + :pr:`27006` by :user:`Meekail Zain ` + :mod:`sklearn.utils` .................... +- |Enhancement| :func:`sklearn.utils.estimator_html_repr` dynamically adapts + diagram colors based on the browser's `prefers-color-scheme`, providing + improved adaptability to dark mode environments. + :pr:`26862` by :user:`Andrew Goh Yisheng <9y5>`, `Thomas Fan`_, `Adrin + Jalali`_. + - |Enhancement| :class:`~utils.metadata_routing.MetadataRequest` and :class:`~utils.metadata_routing.MetadataRouter` now have a ``consumes`` method which can be used to check whether a given set of parameters would be consumed. diff --git a/examples/classification/plot_classifier_comparison.py b/examples/classification/plot_classifier_comparison.py index 75164cff8b492..8d7eb7c63c81a 100644 --- a/examples/classification/plot_classifier_comparison.py +++ b/examples/classification/plot_classifier_comparison.py @@ -58,13 +58,15 @@ classifiers = [ KNeighborsClassifier(3), - SVC(kernel="linear", C=0.025), - SVC(gamma=2, C=1), - GaussianProcessClassifier(1.0 * RBF(1.0)), - DecisionTreeClassifier(max_depth=5), - RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), - MLPClassifier(alpha=1, max_iter=1000), - AdaBoostClassifier(), + SVC(kernel="linear", C=0.025, random_state=42), + SVC(gamma=2, C=1, random_state=42), + GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42), + DecisionTreeClassifier(max_depth=5, random_state=42), + RandomForestClassifier( + max_depth=5, n_estimators=10, max_features=1, random_state=42 + ), + MLPClassifier(alpha=1, max_iter=1000, random_state=42), + AdaBoostClassifier(random_state=42), GaussianNB(), QuadraticDiscriminantAnalysis(), ] diff --git a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py index 7eab9a3437d65..0dde24116065d 100644 --- a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py +++ b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py @@ -12,7 +12,7 @@ trees according to each estimator: - `n_estimators` controls the number of trees in the forest. It's a fixed number. -- `max_iter` is the the maximum number of iterations in a gradient boosting +- `max_iter` is the maximum number of iterations in a gradient boosting based model. The number of iterations corresponds to the number of trees for regression and binary classification problems. Furthermore, the actual number of trees required by the model depends on the stopping criteria. @@ -210,7 +210,7 @@ # models uniformly dominate the Random Forest models in the "test score vs # training speed trade-off" (the HGBDT curve should be on the top left of the RF # curve, without ever crossing). The "test score vs prediction speed" trade-off -# can also be more disputed but it's most often favorable to HGBDT. It's always +# can also be more disputed, but it's most often favorable to HGBDT. 
It's always # a good idea to check both kinds of model (with hyper-parameter tuning) and # compare their performance on your specific problem to determine which model is # the best fit but **HGBT almost always offers a more favorable speed-accuracy diff --git a/examples/feature_selection/plot_select_from_model_diabetes.py b/examples/feature_selection/plot_select_from_model_diabetes.py index 688c2b4ba8079..f008d8d6e8b68 100644 --- a/examples/feature_selection/plot_select_from_model_diabetes.py +++ b/examples/feature_selection/plot_select_from_model_diabetes.py @@ -122,9 +122,6 @@ print(f"Done in {toc_bwd - tic_bwd:.3f}s") # %% -# Discussion -# ---------- -# # Interestingly, forward and backward selection have selected the same set of # features. In general, this isn't the case and the two methods would lead to # different results. @@ -145,3 +142,54 @@ # attribute. The forward SFS is faster than the backward SFS because it only # needs to perform `n_features_to_select = 2` iterations, while the backward # SFS needs to perform `n_features - n_features_to_select = 8` iterations. +# +# Using negative tolerance values +# ------------------------------- +# +# :class:`~sklearn.feature_selection.SequentialFeatureSelector` can be used +# to remove features present in the dataset and return a +# smaller subset of the original features with `direction="backward"` +# and a negative value of `tol`. +# +# We begin by loading the Breast Cancer dataset, consisting of 30 different +# features and 569 samples. +import numpy as np + +from sklearn.datasets import load_breast_cancer + +breast_cancer_data = load_breast_cancer() +X, y = breast_cancer_data.data, breast_cancer_data.target +feature_names = np.array(breast_cancer_data.feature_names) +print(breast_cancer_data.DESCR) + +# %% +# We will make use of the :class:`~sklearn.linear_model.LogisticRegression` +# estimator with :class:`~sklearn.feature_selection.SequentialFeatureSelector` +# to perform the feature selection. +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_auc_score +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler + +for tol in [-1e-2, -1e-3, -1e-4]: + start = time() + feature_selector = SequentialFeatureSelector( + LogisticRegression(), + n_features_to_select="auto", + direction="backward", + scoring="roc_auc", + tol=tol, + n_jobs=2, + ) + model = make_pipeline(StandardScaler(), feature_selector, LogisticRegression()) + model.fit(X, y) + end = time() + print(f"\ntol: {tol}") + print(f"Features selected: {feature_names[model[1].get_support()]}") + print(f"ROC AUC score: {roc_auc_score(y, model.predict_proba(X)[:, 1]):.3f}") + print(f"Done in {end - start:.3f}s") + +# %% +# We can see that the number of features selected tend to increase as negative +# values of `tol` approach to zero. The time taken for feature selection also +# decreases as the values of `tol` come closer to zero. 
diff --git a/examples/miscellaneous/plot_metadata_routing.py b/examples/miscellaneous/plot_metadata_routing.py index 350cd865d972e..9984bb6183348 100644 --- a/examples/miscellaneous/plot_metadata_routing.py +++ b/examples/miscellaneous/plot_metadata_routing.py @@ -447,7 +447,7 @@ def get_metadata_routing(self): return router def fit(self, X, y, **fit_params): - params = process_routing(self, "fit", fit_params) + params = process_routing(self, "fit", **fit_params) self.transformer_ = clone(self.transformer).fit(X, y, **params.transformer.fit) X_transformed = self.transformer_.transform(X, **params.transformer.transform) @@ -458,7 +458,7 @@ def fit(self, X, y, **fit_params): return self def predict(self, X, **predict_params): - params = process_routing(self, "predict", predict_params) + params = process_routing(self, "predict", **predict_params) X_transformed = self.transformer_.transform(X, **params.transformer.transform) return self.classifier_.predict(X_transformed, **params.classifier.predict) @@ -543,7 +543,7 @@ def __init__(self, estimator): self.estimator = estimator def fit(self, X, y, **fit_params): - params = process_routing(self, "fit", fit_params) + params = process_routing(self, "fit", **fit_params) self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) def get_metadata_routing(self): @@ -572,7 +572,7 @@ def __init__(self, estimator): self.estimator = estimator def fit(self, X, y, sample_weight=None, **fit_params): - params = process_routing(self, "fit", fit_params, sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **fit_params) check_metadata(self, sample_weight=sample_weight) self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) diff --git a/examples/neighbors/plot_classification.py b/examples/neighbors/plot_classification.py index 4ed23862ae455..43c45558054cf 100644 --- a/examples/neighbors/plot_classification.py +++ b/examples/neighbors/plot_classification.py @@ -3,61 +3,92 @@ Nearest Neighbors Classification ================================ -Sample usage of Nearest Neighbors classification. -It will plot the decision boundaries for each class. - +This example shows how to use :class:`~sklearn.neighbors.KNeighborsClassifier`. +We train such a classifier on the iris dataset and observe the difference of the +decision boundary obtained with regards to the parameter `weights`. """ -import matplotlib.pyplot as plt -import seaborn as sns -from matplotlib.colors import ListedColormap +# %% +# Load the data +# ------------- +# +# In this example, we use the iris dataset. We split the data into a train and test +# dataset. +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split -from sklearn import datasets, neighbors -from sklearn.inspection import DecisionBoundaryDisplay +iris = load_iris(as_frame=True) +X = iris.data[["sepal length (cm)", "sepal width (cm)"]] +y = iris.target +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0) -n_neighbors = 15 +# %% +# K-nearest neighbors classifier +# ------------------------------ +# +# We want to use a k-nearest neighbors classifier considering a neighborhood of 11 data +# points. Since our k-nearest neighbors model uses euclidean distance to find the +# nearest neighbors, it is therefore important to scale the data beforehand. Refer to +# the example entitled +# :ref:`sphx_glr_auto_examples_preprocessing_plot_scaling_importance.py` for more +# detailed information. 
+# +# Thus, we use a :class:`~sklearn.pipeline.Pipeline` to chain a scaler before to use +# our classifier. +from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler -# import some data to play with -iris = datasets.load_iris() +clf = Pipeline( + steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=11))] +) -# we only take the first two features. We could avoid this ugly -# slicing by using a two-dim dataset -X = iris.data[:, :2] -y = iris.target +# %% +# Decision boundary +# ----------------- +# +# Now, we fit two classifiers with different values of the parameter +# `weights`. We plot the decision boundary of each classifier as well as the original +# dataset to observe the difference. +import matplotlib.pyplot as plt -# Create color maps -cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"]) -cmap_bold = ["darkorange", "c", "darkblue"] +from sklearn.inspection import DecisionBoundaryDisplay -for weights in ["uniform", "distance"]: - # we create an instance of Neighbours Classifier and fit the data. - clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights) - clf.fit(X, y) +_, axs = plt.subplots(ncols=2, figsize=(12, 5)) - _, ax = plt.subplots() - DecisionBoundaryDisplay.from_estimator( +for ax, weights in zip(axs, ("uniform", "distance")): + clf.set_params(knn__weights=weights).fit(X_train, y_train) + disp = DecisionBoundaryDisplay.from_estimator( clf, - X, - cmap=cmap_light, - ax=ax, + X_test, response_method="predict", plot_method="pcolormesh", xlabel=iris.feature_names[0], ylabel=iris.feature_names[1], shading="auto", + alpha=0.5, + ax=ax, ) - - # Plot also the training points - sns.scatterplot( - x=X[:, 0], - y=X[:, 1], - hue=iris.target_names[y], - palette=cmap_bold, - alpha=1.0, - edgecolor="black", + scatter = disp.ax_.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, edgecolors="k") + disp.ax_.legend( + scatter.legend_elements()[0], + iris.target_names, + loc="lower left", + title="Classes", ) - plt.title( - "3-Class classification (k = %i, weights = '%s')" % (n_neighbors, weights) + _ = disp.ax_.set_title( + f"3-Class classification\n(k={clf[-1].n_neighbors}, weights={weights!r})" ) plt.show() + +# %% +# Conclusion +# ---------- +# +# We observe that the parameter `weights` has an impact on the decision boundary. When +# `weights="unifom"` all nearest neighbors will have the same impact on the decision. +# Whereas when `weights="distance"` the weight given to each neighbor is proportional +# to the inverse of the distance from that neighbor to the query point. +# +# In some cases, taking the distance into account might improve the model. diff --git a/examples/preprocessing/plot_target_encoder_cross_val.py b/examples/preprocessing/plot_target_encoder_cross_val.py index f4ff643d8b48e..7244a1bf61cd6 100644 --- a/examples/preprocessing/plot_target_encoder_cross_val.py +++ b/examples/preprocessing/plot_target_encoder_cross_val.py @@ -6,21 +6,26 @@ .. currentmodule:: sklearn.preprocessing The :class:`TargetEncoder` replaces each category of a categorical feature with -the mean of the target variable for that category. This method is useful +the shrunk mean of the target variable for that category. This method is useful in cases where there is a strong relationship between the categorical feature and the target. 
To prevent overfitting, :meth:`TargetEncoder.fit_transform` uses -an internal cross fitting scheme to encode the training data to be used by a -downstream model. In this example, we demonstrate the importance of the cross fitting -procedure to prevent overfitting. +an internal :term:`cross fitting` scheme to encode the training data to be used +by a downstream model. This scheme involves splitting the data into *k* folds +and encoding each fold using the encodings learnt using the other *k-1* folds. +In this example, we demonstrate the importance of the cross +fitting procedure to prevent overfitting. """ # %% # Create Synthetic Dataset # ======================== -# For this example, we build a dataset with three categorical features: an informative -# feature with medium cardinality, an uninformative feature with medium cardinality, -# and an uninformative feature with high cardinality. First, we generate the informative -# feature: +# For this example, we build a dataset with three categorical features: +# +# * an informative feature with medium cardinality ("informative") +# * an uninformative feature with medium cardinality ("shuffled") +# * an uninformative feature with high cardinality ("near_unique") +# +# First, we generate the informative feature: import numpy as np from sklearn.preprocessing import KBinsDiscretizer @@ -33,12 +38,16 @@ n_categories = 100 kbins = KBinsDiscretizer( - n_bins=n_categories, encode="ordinal", strategy="uniform", random_state=rng + n_bins=n_categories, + encode="ordinal", + strategy="uniform", + random_state=rng, + subsample=None, ) X_informative = kbins.fit_transform((y + noise).reshape(-1, 1)) -# Remove the linear relationship between y and the bin index by permuting the values of -# X_informative +# Remove the linear relationship between y and the bin index by permuting the +# values of X_informative: permuted_categories = rng.permutation(n_categories) X_informative = permuted_categories[X_informative.astype(np.int32)] @@ -48,13 +57,13 @@ X_shuffled = rng.permutation(X_informative) # %% -# The uninformative feature with high cardinality is generated so that is independent of -# the target variable. We will show that target encoding without cross fitting will -# cause catastrophic overfitting for the downstream regressor. These high cardinality -# features are basically unique identifiers for samples which should generally be -# removed from machine learning dataset. In this example, we generate them to show how -# :class:`TargetEncoder`'s default cross fitting behavior mitigates the overfitting -# issue automatically. +# The uninformative feature with high cardinality is generated so that it is +# independent of the target variable. We will show that target encoding without +# :term:`cross fitting` will cause catastrophic overfitting for the downstream +# regressor. These high cardinality features are basically unique identifiers +# for samples which should generally be removed from machine learning datasets. +# In this example, we generate them to show how :class:`TargetEncoder`'s default +# :term:`cross fitting` behavior mitigates the overfitting issue automatically. X_near_unique_categories = rng.choice( int(0.9 * n_samples), size=n_samples, replace=True ).reshape(-1, 1) @@ -79,9 +88,10 @@ # ========================== # In this section, we train a ridge regressor on the dataset with and without # encoding and explore the influence of target encoder with and without the -# internal cross fitting. 
First, we see the Ridge model trained on the -# raw features will have low performance, because the order of the informative -# feature is not informative: +# internal :term:`cross fitting`. First, we see the Ridge model trained on the +# raw features will have low performance. This is because we permuted the order +# of the informative feature meaning `X_informative` is not informative when +# raw: import sklearn from sklearn.linear_model import Ridge @@ -96,15 +106,15 @@ # %% # Next, we create a pipeline with the target encoder and ridge model. The pipeline -# uses :meth:`TargetEncoder.fit_transform` which uses cross fitting. We see that -# the model fits the data well and generalizes to the test set: +# uses :meth:`TargetEncoder.fit_transform` which uses :term:`cross fitting`. We +# see that the model fits the data well and generalizes to the test set: from sklearn.pipeline import make_pipeline from sklearn.preprocessing import TargetEncoder -model_with_cv = make_pipeline(TargetEncoder(random_state=0), ridge) -model_with_cv.fit(X_train, y_train) -print("Model with CV on training set: ", model_with_cv.score(X_train, y_train)) -print("Model with CV on test set: ", model_with_cv.score(X_test, y_test)) +model_with_cf = make_pipeline(TargetEncoder(random_state=0), ridge) +model_with_cf.fit(X_train, y_train) +print("Model with CF on train set: ", model_with_cf.score(X_train, y_train)) +print("Model with CF on test set: ", model_with_cf.score(X_test, y_test)) # %% # The coefficients of the linear model shows that most of the weight is on the @@ -114,49 +124,68 @@ plt.rcParams["figure.constrained_layout.use"] = True -coefs_cv = pd.Series( - model_with_cv[-1].coef_, index=model_with_cv[-1].feature_names_in_ +coefs_cf = pd.Series( + model_with_cf[-1].coef_, index=model_with_cf[-1].feature_names_in_ ).sort_values() -_ = coefs_cv.plot(kind="barh") +ax = coefs_cf.plot(kind="barh") +_ = ax.set( + title="Target encoded with cross fitting", + xlabel="Ridge coefficient", + ylabel="Feature", +) # %% -# While :meth:`TargetEncoder.fit_transform` uses an internal cross fitting scheme, -# :meth:`TargetEncoder.transform` itself does not perform any cross fitting. -# It uses the aggregation of the complete training set to transform the categorical -# features. Thus, we can use :meth:`TargetEncoder.fit` followed by -# :meth:`TargetEncoder.transform` to disable the cross fitting. This encoding -# is then passed to the ridge model. +# While :meth:`TargetEncoder.fit_transform` uses an internal +# :term:`cross fitting` scheme to learn encodings for the training set, +# :meth:`TargetEncoder.transform` itself does not. +# It uses the complete training set to learn encodings and to transform the +# categorical features. Thus, we can use :meth:`TargetEncoder.fit` followed by +# :meth:`TargetEncoder.transform` to disable the :term:`cross fitting`. This +# encoding is then passed to the ridge model. 
target_encoder = TargetEncoder(random_state=0) target_encoder.fit(X_train, y_train) -X_train_no_cv_encoding = target_encoder.transform(X_train) -X_test_no_cv_encoding = target_encoder.transform(X_test) +X_train_no_cf_encoding = target_encoder.transform(X_train) +X_test_no_cf_encoding = target_encoder.transform(X_test) -model_no_cv = ridge.fit(X_train_no_cv_encoding, y_train) +model_no_cf = ridge.fit(X_train_no_cf_encoding, y_train) # %% -# We evaluate the model on the non-cross validated encoding and see that it overfits: +# We evaluate the model that did not use :term:`cross fitting` when encoding and +# see that it overfits: print( - "Model without CV on training set: ", - model_no_cv.score(X_train_no_cv_encoding, y_train), + "Model without CF on training set: ", + model_no_cf.score(X_train_no_cf_encoding, y_train), ) print( - "Model without CV on test set: ", model_no_cv.score(X_test_no_cv_encoding, y_test) + "Model without CF on test set: ", + model_no_cf.score( + X_test_no_cf_encoding, + y_test, + ), ) # %% -# The ridge model overfits, because it assigns more weight to the extremely high -# cardinality feature relative to the informative feature. -coefs_no_cv = pd.Series( - model_no_cv.coef_, index=model_no_cv.feature_names_in_ +# The ridge model overfits because it assigns much more weight to the +# uninformative extremely high cardinality ("near_unique") and medium +# cardinality ("shuffled") features than when the model used +# :term:`cross fitting` to encode the features. +coefs_no_cf = pd.Series( + model_no_cf.coef_, index=model_no_cf.feature_names_in_ ).sort_values() -_ = coefs_no_cv.plot(kind="barh") +ax = coefs_no_cf.plot(kind="barh") +_ = ax.set( + title="Target encoded without cross fitting", + xlabel="Ridge coefficient", + ylabel="Feature", +) # %% # Conclusion # ========== -# This example demonstrates the importance of :class:`TargetEncoder`'s internal cross -# fitting. It is important to use :meth:`TargetEncoder.fit_transform` to encode -# training data before passing it to a machine learning model. When a -# :class:`TargetEncoder` is a part of a :class:`~sklearn.pipeline.Pipeline` and the -# pipeline is fitted, the pipeline will correctly call -# :meth:`TargetEncoder.fit_transform` and pass the encoding along. +# This example demonstrates the importance of :class:`TargetEncoder`'s internal +# :term:`cross fitting`. It is important to use +# :meth:`TargetEncoder.fit_transform` to encode training data before passing it +# to a machine learning model. When a :class:`TargetEncoder` is a part of a +# :class:`~sklearn.pipeline.Pipeline` and the pipeline is fitted, the pipeline +# will correctly call :meth:`TargetEncoder.fit_transform` and use +# :term:`cross fitting` when encoding the training data. 
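[Not part of the patch] For readers unfamiliar with the cross fitting scheme this example keeps referring to, the sketch below emulates it with an explicit KFold loop: each fold is encoded with a TargetEncoder fitted on the remaining folds. The real :meth:`TargetEncoder.fit_transform` additionally applies shrinkage toward the global mean and uses stratified splits for classification targets, so this is only an approximation; the helper name is hypothetical.

import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import TargetEncoder

def manual_cross_fit_encode(X, y, n_splits=5, random_state=0):
    """Roughly emulate TargetEncoder.fit_transform's cross fitting:
    every fold is encoded with statistics learnt on the other folds."""
    X = np.asarray(X)
    y = np.asarray(y)
    X_encoded = np.empty(X.shape, dtype=float)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_idx, encode_idx in cv.split(X):
        enc = TargetEncoder(random_state=random_state)
        enc.fit(X[train_idx], y[train_idx])
        X_encoded[encode_idx] = enc.transform(X[encode_idx])
    return X_encoded

# With the X_train / y_train arrays from the example above, this would be
# called as: X_train_encoded = manual_cross_fit_encode(X_train, y_train)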
diff --git a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py index 7c6836632e3f0..d7ae7465a590b 100644 --- a/examples/release_highlights/plot_release_highlights_0_23_0.py +++ b/examples/release_highlights/plot_release_highlights_0_23_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================== Release Highlights for scikit-learn 0.23 diff --git a/examples/release_highlights/plot_release_highlights_0_24_0.py b/examples/release_highlights/plot_release_highlights_0_24_0.py index a55b4aabc7994..29082c1a078f4 100644 --- a/examples/release_highlights/plot_release_highlights_0_24_0.py +++ b/examples/release_highlights/plot_release_highlights_0_24_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================== Release Highlights for scikit-learn 0.24 diff --git a/examples/release_highlights/plot_release_highlights_1_0_0.py b/examples/release_highlights/plot_release_highlights_1_0_0.py index 383612e611688..7ac09dd193c0f 100644 --- a/examples/release_highlights/plot_release_highlights_1_0_0.py +++ b/examples/release_highlights/plot_release_highlights_1_0_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================= Release Highlights for scikit-learn 1.0 diff --git a/examples/release_highlights/plot_release_highlights_1_1_0.py b/examples/release_highlights/plot_release_highlights_1_1_0.py index f6432cf15037c..b3058a7e0aa27 100644 --- a/examples/release_highlights/plot_release_highlights_1_1_0.py +++ b/examples/release_highlights/plot_release_highlights_1_1_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================= Release Highlights for scikit-learn 1.1 diff --git a/examples/release_highlights/plot_release_highlights_1_2_0.py b/examples/release_highlights/plot_release_highlights_1_2_0.py index 8165c3bc4eed0..695e74cfcdd64 100644 --- a/examples/release_highlights/plot_release_highlights_1_2_0.py +++ b/examples/release_highlights/plot_release_highlights_1_2_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================= Release Highlights for scikit-learn 1.2 diff --git a/examples/release_highlights/plot_release_highlights_1_3_0.py b/examples/release_highlights/plot_release_highlights_1_3_0.py index 8fa1ea057ac91..5ce2617cd08aa 100644 --- a/examples/release_highlights/plot_release_highlights_1_3_0.py +++ b/examples/release_highlights/plot_release_highlights_1_3_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================= Release Highlights for scikit-learn 1.3 diff --git a/pyproject.toml b/pyproject.toml index efd72adf44392..c98ed2130189f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,7 @@ exclude=[ # + E501 (line too long) because keeping it < 88 in cython # often makes code less readable. ignore = [ - # check ignored by default in flake8. Meaning unclear. + # multiple spaces/tab after comma 'E24', # space before : (needed for how black formats slicing) 'E203', diff --git a/setup.cfg b/setup.cfg index d91a27344c575..b7705781dbb7d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,7 +20,6 @@ addopts = # correctly on the CI when running `pytest --pyargs sklearn` from the # source folder. 
-p sklearn.tests.random_seed - -rN filterwarnings = ignore:the matrix subclass:PendingDeprecationWarning @@ -54,6 +53,9 @@ ignore = sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx + sklearn/neighbors/_ball_tree.pyx + sklearn/neighbors/_binary_tree.pxi + sklearn/neighbors/_kd_tree.pyx [codespell] diff --git a/setup.py b/setup.py index c41883aa5c37a..5c008944ec05c 100644 --- a/setup.py +++ b/setup.py @@ -306,8 +306,9 @@ def check_package_status(package, min_version): }, ], "neighbors": [ - {"sources": ["_ball_tree.pyx"], "include_np": True}, - {"sources": ["_kd_tree.pyx"], "include_np": True}, + {"sources": ["_binary_tree.pxi.tp"], "include_np": True}, + {"sources": ["_ball_tree.pyx.tp"], "include_np": True}, + {"sources": ["_kd_tree.pyx.tp"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, {"sources": ["_quad_tree.pyx"], "language": "c++", "include_np": True}, ], @@ -514,13 +515,18 @@ def configure_extension_modules(): # `source` is a Tempita file tempita_sources.append(source) - # Do not include pxd files that were generated by tempita - if os.path.splitext(new_source_path)[-1] == ".pxd": - continue - sources.append(new_source_path) + # Only include source files that are pyx files + if os.path.splitext(new_source_path)[-1] == ".pyx": + sources.append(new_source_path) gen_from_templates(tempita_sources) + # Do not progress if we only have a tempita file which we don't + # want to include like the .pxi.tp extension. In such a case + # sources would be empty. + if not sources: + continue + # By convention, our extensions always use the name of the first source source_name = os.path.splitext(os.path.basename(sources[0]))[0] if submodule: diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index f3b61da0915d5..11cb0e42c47f6 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -113,7 +113,7 @@ class BaseLoss: Indicates whether n_classes > 2 is allowed. """ - # For decision trees: + # For gradient boosted decision trees: # This variable indicates whether the loss requires the leaves values to # be updated once the tree has been trained. The trees are trained to # predict a Newton-Raphson step (see grower._finalize_leaf()). But for @@ -122,8 +122,8 @@ class BaseLoss: # procedure. See the original paper Greedy Function Approximation: A # Gradient Boosting Machine by Friedman # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. - need_update_leaves_values = False differentiable = True + need_update_leaves_values = False is_multiclass = False def __init__(self, closs, link, n_classes=None): @@ -543,6 +543,10 @@ class AbsoluteError(BaseLoss): For a given sample x_i, the absolute error is defined as:: loss(x_i) = |y_true_i - raw_prediction_i| + + Note that the exact hessian = 0 almost everywhere (except at one point, therefore + differentiable = False). Optimization routines like in HGBT, however, need a + hessian > 0. Therefore, we assign 1. """ differentiable = False @@ -585,6 +589,10 @@ class PinballLoss(BaseLoss): Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError(). + Note that the exact hessian = 0 almost everywhere (except at one point, therefore + differentiable = False). Optimization routines like in HGBT, however, need a + hessian > 0. Therefore, we assign 1. 
+ Additional Attributes --------------------- quantile : float diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 432ca9e25b152..8d9a964aea172 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -378,10 +378,10 @@ def fit(self, X, y, sample_weight=None, **fit_params): if _routing_enabled(): routed_params = process_routing( - obj=self, - method="fit", + self, + "fit", sample_weight=sample_weight, - other_params=fit_params, + **fit_params, ) else: # sample_weight checks @@ -450,7 +450,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): cv=cv, method=method_name, n_jobs=self.n_jobs, - fit_params=routed_params.estimator.fit, + params=routed_params.estimator.fit, ) predictions = _compute_predictions( pred_method, method_name, X, n_classes @@ -1186,7 +1186,7 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs): f"(Positive class: {self.pos_label})" if self.pos_label is not None else "" ) - line_kwargs = {} + line_kwargs = {"marker": "s", "linestyle": "-"} if name is not None: line_kwargs["label"] = name line_kwargs.update(**kwargs) @@ -1195,9 +1195,7 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs): existing_ref_line = ref_line_label in self.ax_.get_legend_handles_labels()[1] if ref_line and not existing_ref_line: self.ax_.plot([0, 1], [0, 1], "k:", label=ref_line_label) - self.line_ = self.ax_.plot(self.prob_pred, self.prob_true, "s-", **line_kwargs)[ - 0 - ] + self.line_ = self.ax_.plot(self.prob_pred, self.prob_true, **line_kwargs)[0] # We always have to show the legend for at least the reference line self.ax_.legend(loc="lower right") diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 7280bc31423ae..4dd09c9531c44 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -22,6 +22,8 @@ from ._dbscan_inner import dbscan_inner +# This function is not validated using validate_params because +# it's just a factory for DBSCAN. def dbscan( X, eps=0.5, @@ -172,6 +174,9 @@ class DBSCAN(ClusterMixin, BaseEstimator): Finds core samples of high density and expands clusters from them. Good for data which contains clusters of similar density. + The worst case memory complexity of DBSCAN is :math:`O({n}^2)`, which can + occur when the `eps` param is large and `min_samples` is low. + Read more in the :ref:`User Guide `. Parameters @@ -184,8 +189,11 @@ class DBSCAN(ClusterMixin, BaseEstimator): and distance function. min_samples : int, default=5 - The number of samples (or total weight) in a neighborhood for a point - to be considered as a core point. This includes the point itself. + The number of samples (or total weight) in a neighborhood for a point to + be considered as a core point. This includes the point itself. If + `min_samples` is set to a higher value, DBSCAN will find denser clusters, + whereas if it is set to a lower value, the found clusters will be more + sparse. metric : str, or callable, default='euclidean' The metric to use when calculating distance between instances in a diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 57de8962250b1..f8a37c52f55dc 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -462,12 +462,12 @@ class HDBSCAN(ClusterMixin, BaseEstimator): A distance scaling parameter as used in robust single linkage. See [3]_ for more information. 
- algorithm : {"auto", "brute", "kdtree", "balltree"}, default="auto" + algorithm : {"auto", "brute", "kd_tree", "ball_tree"}, default="auto" Exactly which algorithm to use for computing core distances; By default this is set to `"auto"` which attempts to use a :class:`~sklearn.neighbors.KDTree` tree if possible, otherwise it uses - a :class:`~sklearn.neighbors.BallTree` tree. Both `"KDTree"` and - `"BallTree"` algorithms use the + a :class:`~sklearn.neighbors.BallTree` tree. Both `"kd_tree"` and + `"ball_tree"` algorithms use the :class:`~sklearn.neighbors.NearestNeighbors` estimator. If the `X` passed during `fit` is sparse or `metric` is invalid for @@ -475,6 +475,14 @@ class HDBSCAN(ClusterMixin, BaseEstimator): :class:`~sklearn.neighbors.BallTree`, then it resolves to use the `"brute"` algorithm. + .. deprecated:: 1.4 + The `'kdtree'` option was deprecated in version 1.4, + and will be renamed to `'kd_tree'` in 1.6. + + .. deprecated:: 1.4 + The `'balltree'` option was deprecated in version 1.4, + and will be renamed to `'ball_tree'` in 1.6. + leaf_size : int, default=40 Leaf size for trees responsible for fast nearest neighbour queries when a KDTree or a BallTree are used as core-distance algorithms. A large @@ -625,15 +633,12 @@ class HDBSCAN(ClusterMixin, BaseEstimator): "metric": [StrOptions(FAST_METRICS | {"precomputed"}), callable], "metric_params": [dict, None], "alpha": [Interval(Real, left=0, right=None, closed="neither")], + # TODO(1.6): Remove "kdtree" and "balltree" option "algorithm": [ StrOptions( - { - "auto", - "brute", - "kdtree", - "balltree", - } - ) + {"auto", "brute", "kd_tree", "ball_tree", "kdtree", "balltree"}, + deprecated={"kdtree", "balltree"}, + ), ], "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], "n_jobs": [Integral, None], @@ -759,6 +764,31 @@ def fit(self, X, y=None): f"min_samples ({self._min_samples}) must be at most the number of" f" samples in X ({X.shape[0]})" ) + + # TODO(1.6): Remove + if self.algorithm == "kdtree": + warn( + ( + "`algorithm='kdtree'`has been deprecated in 1.4 and will be renamed" + " to'kd_tree'`in 1.6. To keep the past behaviour, set" + " `algorithm='kd_tree'`." + ), + FutureWarning, + ) + self.algorithm = "kd_tree" + + # TODO(1.6): Remove + if self.algorithm == "balltree": + warn( + ( + "`algorithm='balltree'`has been deprecated in 1.4 and will be" + " renamed to'ball_tree'`in 1.6. To keep the past behaviour, set" + " `algorithm='ball_tree'`." + ), + FutureWarning, + ) + self.algorithm = "ball_tree" + mst_func = None kwargs = dict( X=X, @@ -768,12 +798,14 @@ def fit(self, X, y=None): n_jobs=self.n_jobs, **self._metric_params, ) - if self.algorithm == "kdtree" and self.metric not in KDTree.valid_metrics: + if self.algorithm == "kd_tree" and self.metric not in KDTree.valid_metrics: raise ValueError( f"{self.metric} is not a valid metric for a KDTree-based algorithm." " Please select a different metric." ) - elif self.algorithm == "balltree" and self.metric not in BallTree.valid_metrics: + elif ( + self.algorithm == "ball_tree" and self.metric not in BallTree.valid_metrics + ): raise ValueError( f"{self.metric} is not a valid metric for a BallTree-based algorithm." " Please select a different metric." 
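[Not part of the patch] For downstream code, the rename above is purely a spelling change of the `algorithm` option. A minimal usage sketch, assuming a scikit-learn build that includes this deprecation (1.4 or later); the toy data is illustrative only:

from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)

# New spellings; the old "kdtree" / "balltree" values still work in 1.4 but
# emit a FutureWarning and are scheduled for removal in 1.6.
labels_kd = HDBSCAN(algorithm="kd_tree").fit_predict(X)
labels_ball = HDBSCAN(algorithm="ball_tree").fit_predict(X)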
@@ -790,11 +822,11 @@ def fit(self, X, y=None): if self.algorithm == "brute": mst_func = _hdbscan_brute kwargs["copy"] = self.copy - elif self.algorithm == "kdtree": + elif self.algorithm == "kd_tree": mst_func = _hdbscan_prims kwargs["algo"] = "kd_tree" kwargs["leaf_size"] = self.leaf_size - elif self.algorithm == "balltree": + else: mst_func = _hdbscan_prims kwargs["algo"] = "ball_tree" kwargs["leaf_size"] = self.leaf_size diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index c0c281ce31475..63087e75185dc 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -28,8 +28,8 @@ X = StandardScaler().fit_transform(X) ALGORITHMS = [ - "kdtree", - "balltree", + "kd_tree", + "ball_tree", "brute", "auto", ] @@ -149,8 +149,8 @@ def test_hdbscan_algorithms(algo, metric): return ALGOS_TREES = { - "kdtree": KDTree, - "balltree": BallTree, + "kd_tree": KDTree, + "ball_tree": BallTree, } metric_params = { "mahalanobis": {"V": np.eye(X.shape[1])}, @@ -287,22 +287,37 @@ def test_hdbscan_precomputed_non_brute(tree): def test_hdbscan_sparse(): """ Tests that HDBSCAN works correctly when passing sparse feature data. + Evaluates correctness by comparing against the same data passed as a dense + array. """ - sparse_X = sparse.csr_matrix(X) - labels = HDBSCAN().fit(sparse_X).labels_ - n_clusters = len(set(labels) - OUTLIER_SET) + dense_labels = HDBSCAN().fit(X).labels_ + n_clusters = len(set(dense_labels) - OUTLIER_SET) assert n_clusters == 3 - sparse_X_nan = sparse_X.copy() - sparse_X_nan[0, 0] = np.nan - labels = HDBSCAN().fit(sparse_X_nan).labels_ - n_clusters = len(set(labels) - OUTLIER_SET) - assert n_clusters == 3 + _X_sparse = sparse.csr_matrix(X) + X_sparse = _X_sparse.copy() + sparse_labels = HDBSCAN().fit(X_sparse).labels_ + assert_array_equal(dense_labels, sparse_labels) + + # Compare that the sparse and dense non-precomputed routines return the same labels + # where the 0th observation contains the outlier. + for outlier_val, outlier_type in ((np.inf, "infinite"), (np.nan, "missing")): + X_dense = X.copy() + X_dense[0, 0] = outlier_val + dense_labels = HDBSCAN().fit(X_dense).labels_ + n_clusters = len(set(dense_labels) - OUTLIER_SET) + assert n_clusters == 3 + assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"] + + X_sparse = _X_sparse.copy() + X_sparse[0, 0] = outlier_val + sparse_labels = HDBSCAN().fit(X_sparse).labels_ + assert_array_equal(dense_labels, sparse_labels) msg = "Sparse data matrices only support algorithm `brute`." 
with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="euclidean", algorithm="balltree").fit(sparse_X) + HDBSCAN(metric="euclidean", algorithm="ball_tree").fit(X_sparse) @pytest.mark.parametrize("algorithm", ALGORITHMS) @@ -353,7 +368,7 @@ def test_hdbscan_allow_single_cluster_with_epsilon(): cluster_selection_epsilon=0.18, cluster_selection_method="eom", allow_single_cluster=True, - algorithm="kdtree", + algorithm="kd_tree", ).fit_predict(no_structure) unique_labels, counts = np.unique(labels, return_counts=True) assert len(unique_labels) == 2 @@ -418,16 +433,16 @@ def test_hdbscan_tree_invalid_metric(): # Callables are not supported for either with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="kdtree", metric=metric_callable).fit(X) + HDBSCAN(algorithm="kd_tree", metric=metric_callable).fit(X) with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="balltree", metric=metric_callable).fit(X) + HDBSCAN(algorithm="ball_tree", metric=metric_callable).fit(X) # The set of valid metrics for KDTree at the time of writing this test is a # strict subset of those supported in BallTree metrics_not_kd = list(set(BallTree.valid_metrics) - set(KDTree.valid_metrics)) if len(metrics_not_kd) > 0: with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="kdtree", metric=metrics_not_kd[0]).fit(X) + HDBSCAN(algorithm="kd_tree", metric=metrics_not_kd[0]).fit(X) def test_hdbscan_too_many_min_samples(): @@ -531,3 +546,23 @@ def test_labelling_thresholding(): # and the largest value is exactly MAX_LAMBDA. num_noise = condensed_tree["value"] < MAX_LAMBDA assert sum(num_noise) == sum(labels == -1) + + +# TODO(1.6): Remove +def test_hdbscan_warning_on_deprecated_algorithm_name(): + # Test that warning message is shown when algorithm='kdtree' + msg = ( + "`algorithm='kdtree'`has been deprecated in 1.4 and will be renamed" + " to'kd_tree'`in 1.6. To keep the past behaviour, set `algorithm='kd_tree'`." + ) + with pytest.warns(FutureWarning, match=msg): + HDBSCAN(algorithm="kdtree").fit(X) + + # Test that warning message is shown when algorithm='balltree' + msg = ( + "`algorithm='balltree'`has been deprecated in 1.4 and will be renamed" + " to'ball_tree'`in 1.6. To keep the past behaviour, set" + " `algorithm='ball_tree'`." + ) + with pytest.warns(FutureWarning, match=msg): + HDBSCAN(algorithm="balltree").fit(X) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index f1fc90af11d82..822a13064bb08 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -238,7 +238,10 @@ def fit(self, X, Y): Y, input_name="Y", dtype=np.float64, copy=self.copy, ensure_2d=False ) if Y.ndim == 1: + self._predict_1d = True Y = Y.reshape(-1, 1) + else: + self._predict_1d = False n = X.shape[0] p = X.shape[1] @@ -469,8 +472,8 @@ def predict(self, X, copy=True): # Normalize X -= self._x_mean X /= self._x_std - Ypred = X @ self.coef_.T - return Ypred + self.intercept_ + Ypred = X @ self.coef_.T + self.intercept_ + return Ypred.ravel() if self._predict_1d else Ypred def fit_transform(self, X, y=None): """Learn and apply the dimension reduction on the train data. 
diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py index fcdd927efb389..b8b5cbaa0f275 100644 --- a/sklearn/cross_decomposition/tests/test_pls.py +++ b/sklearn/cross_decomposition/tests/test_pls.py @@ -12,7 +12,9 @@ _svd_flip_1d, ) from sklearn.datasets import load_linnerud, make_regression +from sklearn.ensemble import VotingRegressor from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import LinearRegression from sklearn.utils import check_random_state from sklearn.utils.extmath import svd_flip @@ -621,3 +623,24 @@ def test_pls_set_output(Klass): assert isinstance(y_trans, np.ndarray) assert isinstance(X_trans, pd.DataFrame) assert_array_equal(X_trans.columns, est.get_feature_names_out()) + + +def test_pls_regression_fit_1d_y(): + """Check that when fitting with 1d `y`, prediction should also be 1d. + + Non-regression test for Issue #26549. + """ + X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]]) + y = np.array([2, 6, 12, 20, 30, 42]) + expected = y.copy() + + plsr = PLSRegression().fit(X, y) + y_pred = plsr.predict(X) + assert y_pred.shape == expected.shape + + # Check that it works in VotingRegressor + lr = LinearRegression().fit(X, y) + vr = VotingRegressor([("lr", lr), ("plsr", plsr)]) + y_pred = vr.fit(X, y).predict(X) + assert y_pred.shape == expected.shape + assert_allclose(y_pred, expected) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f4a574c62c5e9..7348044e0d8fa 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -66,6 +66,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from sklearn.utils import check_random_state, compute_sample_weight from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions +from sklearn.utils._tags import _safe_tags from sklearn.utils.multiclass import check_classification_targets, type_of_target from sklearn.utils.parallel import Parallel, delayed from sklearn.utils.validation import ( @@ -163,6 +164,7 @@ def _parallel_build_trees( verbose=0, class_weight=None, n_samples_bootstrap=None, + missing_values_in_feature_mask=None, ): """ Private function used to fit a single tree in parallel.""" @@ -189,9 +191,21 @@ def _parallel_build_trees( elif class_weight == "balanced_subsample": curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices) - tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) + tree._fit( + X, + y, + sample_weight=curr_sample_weight, + check_input=False, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) else: - tree.fit(X, y, sample_weight=sample_weight, check_input=False) + tree._fit( + X, + y, + sample_weight=sample_weight, + check_input=False, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) return tree @@ -367,9 +381,26 @@ def fit(self, X, y, sample_weight=None): # Validate or convert input data if issparse(y): raise ValueError("sparse multilabel-indicator for y is not supported.") + X, y = self._validate_data( - X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + force_all_finite=False, + ) + # _compute_missing_values_in_feature_mask checks if X has missing values and + # will raise an error if the underlying tree base estimator can't handle missing + # values. 
Only the criterion is required to determine if the tree supports + # missing values. + estimator = type(self.estimator)(criterion=self.criterion) + missing_values_in_feature_mask = ( + estimator._compute_missing_values_in_feature_mask( + X, estimator_name=self.__class__.__name__ + ) ) + if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) @@ -523,6 +554,7 @@ def fit(self, X, y, sample_weight=None): verbose=self.verbose, class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, ) for i, t in enumerate(trees) ) @@ -650,7 +682,18 @@ def _validate_X_predict(self, X): """ Validate X whenever one tries to predict, apply, predict_proba.""" check_is_fitted(self) - X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False) + if self.estimators_[0]._support_missing_values(X): + force_all_finite = "allow-nan" + else: + force_all_finite = True + + X = self._validate_data( + X, + dtype=DTYPE, + accept_sparse="csr", + reset=False, + force_all_finite=force_all_finite, + ) if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): raise ValueError("No support for np.int64 index based sparse matrices") return X @@ -858,6 +901,12 @@ def get_leaf_node_samples(self, X): leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) return leaf_nodes_samples + def _more_tags(self): + # Only the criterion is required to determine if the tree supports + # missing values + estimator = type(self.estimator)(criterion=self.criterion) + return {"allow_nan": _safe_tags(estimator, key="allow_nan")} + def _accumulate_prediction(predict, X, out, lock): """ diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 3a14da52047ad..f1e7b7d6e063a 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -243,13 +243,14 @@ def _fit_stage( # no inplace multiplication! sample_weight = sample_weight * sample_mask.astype(np.float64) - X = X_csr if X_csr is not None else X + X = X_csc if X_csc is not None else X tree.fit(X, residual, sample_weight=sample_weight, check_input=False) # update tree leaves + X_for_tree_update = X_csr if X_csr is not None else X loss.update_terminal_regions( tree.tree_, - X, + X_for_tree_update, y, residual, raw_predictions, @@ -434,16 +435,18 @@ def fit(self, X, y, sample_weight=None, monitor=None): if self.n_iter_no_change is not None: stratify = y if is_classifier(self) else None - X, X_val, y, y_val, sample_weight, sample_weight_val = train_test_split( - X, - y, - sample_weight, - random_state=self.random_state, - test_size=self.validation_fraction, - stratify=stratify, + X_train, X_val, y_train, y_val, sample_weight_train, sample_weight_val = ( + train_test_split( + X, + y, + sample_weight, + random_state=self.random_state, + test_size=self.validation_fraction, + stratify=stratify, + ) ) if is_classifier(self): - if self._n_classes != np.unique(y).shape[0]: + if self._n_classes != np.unique(y_train).shape[0]: # We choose to error here. The problem is that the init # estimator would be trained on y, which has some missing # classes now, so its predictions would not have the @@ -454,6 +457,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): "seed." 
) else: + X_train, y_train, sample_weight_train = X, y, sample_weight X_val = y_val = sample_weight_val = None if not self._is_initialized(): @@ -463,19 +467,21 @@ def fit(self, X, y, sample_weight=None, monitor=None): # fit initial model and initialize raw predictions if self.init_ == "zero": raw_predictions = np.zeros( - shape=(X.shape[0], self._loss.K), dtype=np.float64 + shape=(X_train.shape[0], self._loss.K), dtype=np.float64 ) else: # XXX clean this once we have a support_sample_weight tag if sample_weight_is_none: - self.init_.fit(X, y) + self.init_.fit(X_train, y_train) else: msg = ( "The initial estimator {} does not support sample " "weights.".format(self.init_.__class__.__name__) ) try: - self.init_.fit(X, y, sample_weight=sample_weight) + self.init_.fit( + X_train, y_train, sample_weight=sample_weight_train + ) except TypeError as e: if "unexpected keyword argument 'sample_weight'" in str(e): # regular estimator without SW support @@ -493,7 +499,9 @@ def fit(self, X, y, sample_weight=None, monitor=None): else: # regular estimator whose input checking failed raise - raw_predictions = self._loss.get_init_raw_predictions(X, self.init_) + raw_predictions = self._loss.get_init_raw_predictions( + X_train, self.init_ + ) begin_at_stage = 0 @@ -513,22 +521,22 @@ def fit(self, X, y, sample_weight=None, monitor=None): # The requirements of _raw_predict # are more constrained than fit. It accepts only CSR # matrices. Finite values have already been checked in _validate_data. - X = check_array( - X, + X_train = check_array( + X_train, dtype=DTYPE, order="C", accept_sparse="csr", force_all_finite=False, ) - raw_predictions = self._raw_predict(X) + raw_predictions = self._raw_predict(X_train) self._resize_state() # fit the boosting stages n_stages = self._fit_stages( - X, - y, + X_train, + y_train, raw_predictions, - sample_weight, + sample_weight_train, self._rng, X_val, y_val, diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 5d030d3add5bb..c3af930654b73 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -59,13 +59,23 @@ def _update_leaves_values(loss, grower, y_true, raw_prediction, sample_weight): Update equals: loss.fit_intercept_only(y_true - raw_prediction) - This is only applied if loss.need_update_leaves_values is True. + This is only applied if loss.differentiable is False. Note: It only works, if the loss is a function of the residual, as is the case for AbsoluteError and PinballLoss. Otherwise, one would need to get the minimum of loss(y_true, raw_prediction + x) in x. A few examples: - AbsoluteError: median(y_true - raw_prediction). - PinballLoss: quantile(y_true - raw_prediction). - See also notes about need_update_leaves_values in BaseLoss. + + More background: + For the standard gradient descent method according to "Greedy Function + Approximation: A Gradient Boosting Machine" by Friedman, all loss functions but the + squared loss need a line search step. BaseHistGradientBoosting, however, implements + a so called Newton boosting where the trees are fitted to a 2nd order + approximations of the loss in terms of gradients and hessians. In this case, the + line search step is only necessary if the loss is not smooth, i.e. not + differentiable, which renders the 2nd order approximation invalid. 
In fact, + non-smooth losses arbitrarily set hessians to 1 and effectively use the standard + gradient descent method with line search. """ # TODO: Ideally this should be computed in parallel over the leaves using something # similar to _update_raw_predictions(), but this requires a cython version of @@ -699,7 +709,7 @@ def fit(self, X, y, sample_weight=None): acc_find_split_time += grower.total_find_split_time acc_compute_hist_time += grower.total_compute_hist_time - if self._loss.need_update_leaves_values: + if not self._loss.differentiable: _update_leaves_values( loss=self._loss, grower=grower, diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 2129e4d9a0134..539d97fbf345e 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -254,7 +254,7 @@ def fit(self, X, y, sample_weight=None): cv=deepcopy(cv), method=meth, n_jobs=self.n_jobs, - fit_params=fit_params, + params=fit_params, verbose=self.verbose, ) for est, meth in zip(all_estimators, self.stack_method_) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 9291b6982a923..efc5d7d5ee5a4 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -2031,3 +2031,94 @@ def test_multioutput_quantiles(name): assert len(leaf_nodes_samples) == len(X_test) for node_samples in leaf_nodes_samples: assert node_samples.shape[1] == est.n_outputs_ + + +@pytest.mark.parametrize( + "make_data, Forest", + [ + (datasets.make_regression, RandomForestRegressor), + (datasets.make_classification, RandomForestClassifier), + ], +) +def test_missing_values_is_resilient(make_data, Forest): + """Check that forest can deal with missing values and has decent performance.""" + + rng = np.random.RandomState(0) + n_samples, n_features = 1000, 10 + X, y = make_data(n_samples=n_samples, n_features=n_features, random_state=rng) + + # Create dataset with missing values + X_missing = X.copy() + X_missing[rng.choice([False, True], size=X.shape, p=[0.95, 0.05])] = np.nan + assert np.isnan(X_missing).any() + + X_missing_train, X_missing_test, y_train, y_test = train_test_split( + X_missing, y, random_state=0 + ) + + # Train forest with missing values + forest_with_missing = Forest(random_state=rng, n_estimators=50) + forest_with_missing.fit(X_missing_train, y_train) + score_with_missing = forest_with_missing.score(X_missing_test, y_test) + + # Train forest without missing values + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + forest = Forest(random_state=rng, n_estimators=50) + forest.fit(X_train, y_train) + score_without_missing = forest.score(X_test, y_test) + + # Score is still 80 percent of the forest's score that had no missing values + assert score_with_missing >= 0.80 * score_without_missing + + +@pytest.mark.parametrize("Forest", [RandomForestClassifier, RandomForestRegressor]) +def test_missing_value_is_predictive(Forest): + """Check that the forest learns when missing values are only present for + a predictive feature.""" + rng = np.random.RandomState(0) + n_samples = 300 + + X_non_predictive = rng.standard_normal(size=(n_samples, 10)) + y = rng.randint(0, high=2, size=n_samples) + + # Create a predictive feature using `y` and with some noise + X_random_mask = rng.choice([False, True], size=n_samples, p=[0.95, 0.05]) + y_mask = y.astype(bool) + y_mask[X_random_mask] = ~y_mask[X_random_mask] + + predictive_feature = rng.standard_normal(size=n_samples) + predictive_feature[y_mask] = np.nan + assert 
np.isnan(predictive_feature).any() + + X_predictive = X_non_predictive.copy() + X_predictive[:, 5] = predictive_feature + + ( + X_predictive_train, + X_predictive_test, + X_non_predictive_train, + X_non_predictive_test, + y_train, + y_test, + ) = train_test_split(X_predictive, X_non_predictive, y, random_state=0) + forest_predictive = Forest(random_state=0).fit(X_predictive_train, y_train) + forest_non_predictive = Forest(random_state=0).fit(X_non_predictive_train, y_train) + + predictive_test_score = forest_predictive.score(X_predictive_test, y_test) + + assert predictive_test_score >= 0.75 + assert predictive_test_score >= forest_non_predictive.score( + X_non_predictive_test, y_test + ) + + +def test_non_supported_criterion_raises_error_with_missing_values(): + """Raise error for unsupported criterion when there are missing values.""" + X = np.array([[0, 1, 2], [np.nan, 0, 2.0]]) + y = [0.5, 1.0] + + forest = RandomForestRegressor(criterion="absolute_error") + + msg = "RandomForestRegressor does not accept missing values" + with pytest.raises(ValueError, match=msg): + forest.fit(X, y) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 11cf083992653..b3dffa5494b0d 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -33,7 +33,12 @@ def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer): X_train, y_train, lambda estimator, features: _score( - estimator, X_test[:, features], y_test, scorer + # TODO(SLEP6): pass score_params here + estimator, + X_test[:, features], + y_test, + scorer, + score_params=None, ), ).scores_ diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index db0da278b39ef..e36b49f262b2d 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -282,7 +282,12 @@ def transform(self, X): Xc[:, ~valid_mask] = 0 else: Xc = X[:, valid_mask] - return Xc + + # Even if there are no missing values in X, we still concatenate Xc + # with the missing value indicator matrix, X_indicator. + # This is to ensure that the output maintains consistency in terms + # of columns, regardless of whether missing values exist in X or not. + return super()._concatenate_indicator(Xc, X_indicator) row_missing_idx = np.flatnonzero(mask.any(axis=1)) diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py index aad7eb12a0a92..be2fa6e4d1736 100644 --- a/sklearn/impute/tests/test_common.py +++ b/sklearn/impute/tests/test_common.py @@ -181,3 +181,39 @@ def test_keep_empty_features(imputer, keep_empty_features): assert X_imputed.shape == X.shape else: assert X_imputed.shape == (X.shape[0], X.shape[1] - 1) + + +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +@pytest.mark.parametrize("missing_value_test", [np.nan, 1]) +def test_imputation_adds_missing_indicator_if_add_indicator_is_true( + imputer, missing_value_test +): + """Check that missing indicator always exists when add_indicator=True. + + Non-regression test for gh-26590. + """ + X_train = np.array([[0, np.NaN], [1, 2]]) + + # Test data where missing_value_test variable can be set to np.NaN or 1. 
+ X_test = np.array([[0, missing_value_test], [1, 2]]) + + imputer.set_params(add_indicator=True) + imputer.fit(X_train) + + X_test_imputed_with_indicator = imputer.transform(X_test) + assert X_test_imputed_with_indicator.shape == (2, 3) + + imputer.set_params(add_indicator=False) + imputer.fit(X_train) + X_test_imputed_without_indicator = imputer.transform(X_test) + assert X_test_imputed_without_indicator.shape == (2, 2) + + assert_allclose( + X_test_imputed_with_indicator[:, :-1], X_test_imputed_without_indicator + ) + if np.isnan(missing_value_test): + expected_missing_indicator = [1, 0] + else: + expected_missing_indicator = [0, 0] + + assert_allclose(X_test_imputed_with_indicator[:, -1], expected_missing_indicator) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 1a9bc7216a0b5..a76fd98940ad4 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -856,8 +856,9 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): in 1.4. Use `None` instead. dual : bool, default=False - Dual or primal formulation. Dual formulation is only implemented for - l2 penalty with liblinear solver. Prefer dual=False when + Dual (constrained) or primal (regularized, see also + :ref:`this equation `) formulation. Dual formulation + is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. tol : float, default=1e-4 @@ -1474,8 +1475,9 @@ class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstima ``cv`` default value if None changed from 3-fold to 5-fold. dual : bool, default=False - Dual or primal formulation. Dual formulation is only implemented for - l2 penalty with liblinear solver. Prefer dual=False when + Dual (constrained) or primal (regularized, see also + :ref:`this equation `) formulation. Dual formulation + is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. 
penalty : {'l1', 'l2', 'elasticnet'}, default='l2' @@ -1857,10 +1859,10 @@ def fit(self, X, y, sample_weight=None, **params): if _routing_enabled(): routed_params = process_routing( - obj=self, - method="fit", + self, + "fit", sample_weight=sample_weight, - other_params=params, + **params, ) else: routed_params = Bunch() @@ -2148,10 +2150,10 @@ def score(self, X, y, sample_weight=None, **score_params): scoring = self._get_scorer() if _routing_enabled(): routed_params = process_routing( - obj=self, - method="score", + self, + "score", sample_weight=sample_weight, - other_params=score_params, + **score_params, ) else: routed_params = Bunch() diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 60b8da3ecfa46..313225088c776 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -71,26 +71,26 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef object func cdef object kwargs - cdef float64_t dist( + cdef {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil - cdef float64_t rdist( + cdef {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil - cdef float64_t dist_csr( + cdef {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -98,12 +98,12 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): const intp_t size, ) except -1 nogil - cdef float64_t rdist_csr( + cdef {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -114,39 +114,39 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef int pdist( self, const {{INPUT_DTYPE_t}}[:, ::1] X, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 cdef int cdist( self, const {{INPUT_DTYPE_t}}[:, ::1] X, const {{INPUT_DTYPE_t}}[:, ::1] Y, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 cdef int pdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const int32_t[:] x1_indptr, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, const intp_t size, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 nogil cdef int cdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const int32_t[:] x1_indptr, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, - const int32_t[:] x2_indptr, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, const intp_t size, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 nogil - cdef float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil + cdef {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil - cdef float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil + cdef {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil {{endfor}} diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp 
index bc54e51a7511a..6b5ea300f038b 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -65,6 +65,118 @@ def get_valid_metric_ids(L): if (val.__name__ in L) or (val in L)] cdef class DistanceMetric: + """Uniform interface for fast distance metric functions. + + The `DistanceMetric` class provides a convenient way to compute pairwise distances + between samples. It supports various distance metrics, such as Euclidean distance, + Manhattan distance, and more. + + The `pairwise` method can be used to compute pairwise distances between samples in + the input arrays. It returns a distance matrix representing the distances between + all pairs of samples. + + The :meth:`get_metric` method allows you to retrieve a specific metric using its + string identifier. + + Examples + -------- + >>> from sklearn.metrics import DistanceMetric + >>> dist = DistanceMetric.get_metric('euclidean') + >>> X = [[1, 2], [3, 4], [5, 6]] + >>> Y = [[7, 8], [9, 10]] + >>> dist.pairwise(X,Y) + array([[7.81..., 10.63...] + [5.65..., 8.48...] + [1.41..., 4.24...]]) + + Available Metrics + + The following lists the string metric identifiers and the associated + distance metric classes: + + **Metrics intended for real-valued vector spaces:** + + ============== ==================== ======== =============================== + identifier class name args distance function + -------------- -------------------- -------- ------------------------------- + "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` + "manhattan" ManhattanDistance - ``sum(|x - y|)`` + "chebyshev" ChebyshevDistance - ``max(|x - y|)`` + "minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` + "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` + "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` + ============== ==================== ======== =============================== + + **Metrics intended for two-dimensional vector spaces:** Note that the haversine + distance metric requires data in the form of [latitude, longitude] and both + inputs and outputs are in units of radians. + + ============ ================== =============================================================== + identifier class name distance function + ------------ ------------------ --------------------------------------------------------------- + "haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))`` + ============ ================== =============================================================== + + + **Metrics intended for integer-valued vector spaces:** Though intended + for integer-valued vectors, these are also valid metrics in the case of + real-valued vectors. + + ============= ==================== ======================================== + identifier class name distance function + ------------- -------------------- ---------------------------------------- + "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` + "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` + "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` + ============= ==================== ======================================== + + **Metrics intended for boolean-valued vector spaces:** Any nonzero entry + is evaluated to "True". 
In the listings below, the following + abbreviations are used: + + - N : number of dimensions + - NTT : number of dims in which both values are True + - NTF : number of dims in which the first value is True, second is False + - NFT : number of dims in which the first value is False, second is True + - NFF : number of dims in which both values are False + - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT + - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT + + ================= ======================= =============================== + identifier class name distance function + ----------------- ----------------------- ------------------------------- + "jaccard" JaccardDistance NNEQ / NNZ + "matching" MatchingDistance NNEQ / N + "dice" DiceDistance NNEQ / (NTT + NNZ) + "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) + "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) + "russellrao" RussellRaoDistance (N - NTT) / N + "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) + "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) + ================= ======================= =============================== + + **User-defined distance:** + + =========== =============== ======= + identifier class name args + ----------- --------------- ------- + "pyfunc" PyFuncDistance func + =========== =============== ======= + + Here ``func`` is a function which takes two one-dimensional numpy + arrays, and returns a distance. Note that in order to be used within + the BallTree, the distance must be a true metric: + i.e. it must satisfy the following properties + + 1) Non-negativity: d(x, y) >= 0 + 2) Identity: d(x, y) = 0 if and only if x == y + 3) Symmetry: d(x, y) = d(y, x) + 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) + + Because of the Python object overhead involved in calling the python + function, this will be fairly slow, but it will have the same + scaling as other distances. + """ @classmethod def get_metric(cls, metric, dtype=np.float64, **kwargs): """Get the given distance metric from the string identifier. @@ -74,11 +186,24 @@ cdef class DistanceMetric: Parameters ---------- metric : str or class name - The distance metric to use + The string identifier or class name of the desired distance metric. + See the documentation of the `DistanceMetric` class for a list of + available metrics. + dtype : {np.float32, np.float64}, default=np.float64 - The dtype of the data on which the metric will be applied + The data type of the input on which the metric will be applied. + This affects the precision of the computed distances. + By default, it is set to `np.float64`. + **kwargs - additional arguments will be passed to the requested metric + Additional keyword arguments that will be passed to the requested metric. + These arguments can be used to customize the behavior of the specific + metric. + + Returns + ------- + metric_obj : instance of the requested metric + An instance of the requested distance metric class. 
""" if dtype == np.float32: specialized_class = DistanceMetric32 @@ -332,7 +457,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): """ return - cdef float64_t dist( + cdef {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -344,7 +469,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): """ return -999 - cdef float64_t rdist( + cdef {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -364,7 +489,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef int pdist( self, const {{INPUT_DTYPE_t}}[:, ::1] X, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1: """Compute the pairwise distances between points in X""" cdef intp_t i1, i2 @@ -379,7 +504,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): self, const {{INPUT_DTYPE_t}}[:, ::1] X, const {{INPUT_DTYPE_t}}[:, ::1] Y, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1: """Compute the cross-pairwise distances between arrays X and Y""" cdef intp_t i1, i2 @@ -390,12 +515,12 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) return 0 - cdef float64_t dist_csr( + cdef {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -420,12 +545,12 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): 2. An alternative signature would be: - cdef float64_t dist_csr( + cdef {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, ) except -1 nogil: Where callers would use slicing on the original CSR data and indices @@ -456,12 +581,12 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): """ return -999 - cdef float64_t rdist_csr( + cdef {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -500,10 +625,10 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef int pdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const int32_t[:] x1_indptr, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, const intp_t size, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 nogil: """Pairwise distances between rows in CSR matrix X. 
@@ -523,9 +648,9 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): x2_end = x1_indptr[i2 + 1] D[i1, i2] = D[i2, i1] = self.dist_csr( x1_data, - x1_indices, + &x1_indices[0], x1_data, - x1_indices, + &x1_indices[0], x1_start, x1_end, x2_start, @@ -537,13 +662,13 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef int cdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const int32_t[:] x1_indptr, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, - const int32_t[:] x2_indptr, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, const intp_t size, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 nogil: """Compute the cross-pairwise distances between arrays X and Y represented in the CSR format.""" @@ -562,9 +687,9 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): D[i1, i2] = self.dist_csr( x1_data, - x1_indices, + &x1_indices[0], x2_data, - x2_indices, + &x2_indices[0], x1_start, x1_end, x2_start, @@ -573,11 +698,11 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): ) return 0 - cdef float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: """Convert the rank-preserving surrogate distance to the distance""" return rdist - cdef float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: """Convert the distance to the rank-preserving surrogate distance""" return dist @@ -624,33 +749,33 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): def _pairwise_dense_dense(self, X, Y): cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr cdef const {{INPUT_DTYPE_t}}[:, ::1] Yarr - cdef float64_t[:, ::1] Darr + cdef {{INPUT_DTYPE_t}}[:, ::1] Darr Xarr = np.asarray(X, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Xarr) if X is Y: - Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), dtype=np.float64, order='C') + Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), dtype={{INPUT_DTYPE}}, order='C') self.pdist(Xarr, Darr) else: Yarr = np.asarray(Y, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Yarr) - Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype=np.float64, order='C') + Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype={{INPUT_DTYPE}}, order='C') self.cdist(Xarr, Yarr, Darr) return np.asarray(Darr) def _pairwise_sparse_sparse(self, X: csr_matrix , Y: csr_matrix): cdef: intp_t n_X, n_features - const {{INPUT_DTYPE_t}}[:] X_data - const int32_t[:] X_indices - const int32_t[:] X_indptr + const {{INPUT_DTYPE_t}}[::1] X_data + const int32_t[::1] X_indices + const int32_t[::1] X_indptr intp_t n_Y - const {{INPUT_DTYPE_t}}[:] Y_data - const int32_t[:] Y_indices - const int32_t[:] Y_indptr + const {{INPUT_DTYPE_t}}[::1] Y_data + const int32_t[::1] Y_indices + const int32_t[::1] Y_indptr - float64_t[:, ::1] Darr + {{INPUT_DTYPE_t}}[:, ::1] Darr X_csr = X.tocsr() n_X, n_features = X_csr.shape @@ -658,7 +783,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): X_indices = np.asarray(X_csr.indices, dtype=np.int32) X_indptr = np.asarray(X_csr.indptr, dtype=np.int32) if X is Y: - Darr = np.empty((n_X, n_X), dtype=np.float64, order='C') + Darr = np.empty((n_X, n_X), dtype={{INPUT_DTYPE}}, order='C') self.pdist_csr( x1_data=&X_data[0], x1_indices=X_indices, @@ -673,7 +798,7 @@ cdef class 
DistanceMetric{{name_suffix}}(DistanceMetric): Y_indices = np.asarray(Y_csr.indices, dtype=np.int32) Y_indptr = np.asarray(Y_csr.indptr, dtype=np.int32) - Darr = np.empty((n_X, n_Y), dtype=np.float64, order='C') + Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') self.cdist_csr( x1_data=&X_data[0], x1_indices=X_indices, @@ -690,13 +815,13 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef: intp_t n_X = X.shape[0] intp_t n_features = X.shape[1] - const {{INPUT_DTYPE_t}}[:] X_data = np.asarray( + const {{INPUT_DTYPE_t}}[::1] X_data = np.asarray( X.data, dtype={{INPUT_DTYPE}}, ) - const int32_t[:] X_indices = np.asarray( + const int32_t[::1] X_indices = np.asarray( X.indices, dtype=np.int32, ) - const int32_t[:] X_indptr = np.asarray( + const int32_t[::1] X_indptr = np.asarray( X.indptr, dtype=np.int32, ) @@ -704,11 +829,11 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): Y, dtype={{INPUT_DTYPE}}, order="C", ) intp_t n_Y = Y_data.shape[0] - const int32_t[:] Y_indices = ( + const int32_t[::1] Y_indices = ( np.arange(n_features, dtype=np.int32) ) - float64_t[:, ::1] Darr = np.empty((n_X, n_Y), dtype=np.float64, order='C') + {{INPUT_DTYPE_t}}[:, ::1] Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') intp_t i1, i2 intp_t x1_start, x1_end @@ -735,9 +860,9 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): Darr[i1, i2] = self.dist_csr( x1_data=&X_data[0], - x1_indices=X_indices, + x1_indices=&X_indices[0], x2_data=x2_data, - x2_indices=Y_indices, + x2_indices=&Y_indices[0], x1_start=x1_start, x1_end=x1_end, x2_start=0, @@ -758,22 +883,22 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): const {{INPUT_DTYPE_t}}[:, ::1] X_data = np.asarray( X, dtype={{INPUT_DTYPE}}, order="C", ) - const int32_t[:] X_indices = np.arange( + const int32_t[::1] X_indices = np.arange( n_features, dtype=np.int32, ) intp_t n_Y = Y.shape[0] - const {{INPUT_DTYPE_t}}[:] Y_data = np.asarray( + const {{INPUT_DTYPE_t}}[::1] Y_data = np.asarray( Y.data, dtype={{INPUT_DTYPE}}, ) - const int32_t[:] Y_indices = np.asarray( + const int32_t[::1] Y_indices = np.asarray( Y.indices, dtype=np.int32, ) - const int32_t[:] Y_indptr = np.asarray( + const int32_t[::1] Y_indptr = np.asarray( Y.indptr, dtype=np.int32, ) - float64_t[:, ::1] Darr = np.empty((n_X, n_Y), dtype=np.float64, order='C') + {{INPUT_DTYPE_t}}[:, ::1] Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') intp_t i1, i2 {{INPUT_DTYPE_t}} * x1_data @@ -801,9 +926,9 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): Darr[i1, i2] = self.dist_csr( x1_data=x1_data, - x1_indices=X_indices, + x1_indices=&X_indices[0], x2_data=&Y_data[0], - x2_indices=Y_indices, + x2_indices=&Y_indices[0], x1_start=0, x1_end=n_features, x2_start=x2_start, @@ -867,24 +992,24 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = 2 - cdef inline float64_t dist(self, + cdef inline {{INPUT_DTYPE_t}} dist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: return euclidean_dist{{name_suffix}}(x1, x2, size) - cdef inline float64_t rdist(self, + cdef inline {{INPUT_DTYPE_t}} rdist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: return euclidean_rdist{{name_suffix}}(x1, x2, size) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return sqrt(rdist) 
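A minimal sketch, assuming a build of this branch, of the user-visible effect of swapping the `float64_t` return and buffer types for `{{INPUT_DTYPE_t}}`: requesting the float32 specialization via `get_metric(..., dtype=np.float32)` (documented earlier in this diff) should now yield a float32 distance matrix instead of the float64 one produced before these template changes. The output dtype is an inference from the templates, not a documented guarantee.

```python
# Sketch: float32 inputs are expected to produce a float32 distance matrix on this branch.
import numpy as np
from sklearn.metrics import DistanceMetric

X = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32)
dist32 = DistanceMetric.get_metric("euclidean", dtype=np.float32)
D = dist32.pairwise(X)
print(D.dtype)  # expected: float32 with this patch applied (float64 upstream before it)
```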
- cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: return dist * dist def rdist_to_dist(self, rdist): @@ -893,12 +1018,12 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -945,12 +1070,12 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -988,7 +1113,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('SEuclidean dist: size of V does not match') - cdef inline float64_t rdist( + cdef inline {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1001,7 +1126,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (tmp * tmp / self.vec[j]) return d - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1009,10 +1134,10 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ) except -1 nogil: return sqrt(self.rdist(x1, x2, size)) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return sqrt(rdist) - cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: return dist * dist def rdist_to_dist(self, rdist): @@ -1021,12 +1146,12 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1074,12 +1199,12 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1111,7 +1236,7 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = 1 - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const 
{{INPUT_DTYPE_t}}* x2, @@ -1123,12 +1248,12 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += fabs(x1[j] - x2[j]) return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1141,7 +1266,7 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): intp_t i1 = x1_start intp_t i2 = x2_start - float64_t d = 0.0 + {{INPUT_DTYPE_t}} d = 0.0 while i1 < x1_end and i2 < x2_end: ix1 = x1_indices[i1] @@ -1194,7 +1319,7 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = INF{{name_suffix}} - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1207,12 +1332,12 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1271,19 +1396,27 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): Parameters ---------- - p : int + p : float The order of the p-norm of the difference (see above). + + .. versionchanged:: 1.4.0 + Minkowski distance allows `p` to be `0= 1 and finite. For p = infinity, - use ChebyshevDistance. + Minkowski Distance requires p > 0 and finite. + When :math:`p \in (0,1)`, it isn't a true metric but is permissible when + the triangular inequality isn't necessary. + For p = infinity, use ChebyshevDistance. Note that for p=1, ManhattanDistance is more efficient, and for p=2, EuclideanDistance is more efficient. + """ def __init__(self, p, w=None): - if p < 1: - raise ValueError("p must be greater than 1") + if p <= 0: + raise ValueError("p must be greater than 0") elif np.isinf(p): raise ValueError("MinkowskiDistance requires finite p. " "For p=inf, use ChebyshevDistance.") @@ -1307,7 +1440,7 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): f"the number of features ({X.shape[1]}). " f"Currently len(w)={self.size}.") - cdef inline float64_t rdist( + cdef inline {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1324,7 +1457,7 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (pow(fabs(x1[j] - x2[j]), self.p)) return d - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1332,10 +1465,10 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ) except -1 nogil: return pow(self.rdist(x1, x2, size), 1. / self.p) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return pow(rdist, 1. 
/ self.p) - cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: return pow(dist, self.p) def rdist_to_dist(self, rdist): @@ -1344,12 +1477,12 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** self.p - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1424,12 +1557,12 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1496,7 +1629,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('Mahalanobis dist: size of V does not match') - cdef inline float64_t rdist( + cdef inline {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1516,7 +1649,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += tmp * self.buffer[i] return d - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1524,10 +1657,10 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ) except -1 nogil: return sqrt(self.rdist(x1, x2, size)) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return sqrt(rdist) - cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: return dist * dist def rdist_to_dist(self, rdist): @@ -1536,12 +1669,12 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1590,12 +1723,12 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1627,7 +1760,7 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. 
math:: D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1641,12 +1774,12 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return float(n_unequal) / size - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1702,7 +1835,7 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. math:: D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1716,12 +1849,12 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += fabs(x1[j] - x2[j]) / denom return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1777,7 +1910,7 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. math:: D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1793,12 +1926,12 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): else: return 0.0 - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1857,7 +1990,7 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1877,12 +2010,12 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return 0 return (nnz - n_eq) * 1.0 / nnz - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1946,7 +2079,7 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / N """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1960,12 +2093,12 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return n_neq * 1. 
/ size - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2021,7 +2154,7 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (2 * N_TT + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2036,12 +2169,12 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return n_neq / (2.0 * n_tt + n_neq) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2102,7 +2235,7 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 1 - N_TT / (N + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2117,12 +2250,12 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return (n_neq - n_tt + size) * 1.0 / (n_neq + size) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2181,7 +2314,7 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2195,12 +2328,12 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2258,7 +2391,7 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N - N_TT) / N """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2272,12 +2405,12 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return (size - n_tt) * 1. 
/ size - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2328,7 +2461,7 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2342,12 +2475,12 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2405,7 +2538,7 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT / 2 + N_FT + N_TF) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2420,12 +2553,12 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return n_neq / (0.5 * n_tt + n_neq) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2494,7 +2627,7 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): raise ValueError("Haversine distance only valid " "in 2 dimensions") - cdef inline float64_t rdist(self, + cdef inline {{INPUT_DTYPE_t}} rdist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, @@ -2503,17 +2636,17 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef float64_t sin_1 = sin(0.5 * ((x1[1]) - (x2[1]))) return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) - cdef inline float64_t dist(self, + cdef inline {{INPUT_DTYPE_t}} dist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: return 2 * asin(sqrt(self.rdist(x1, x2, size))) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return 2 * asin(sqrt(rdist)) - cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: cdef float64_t tmp = sin(0.5 * dist) return tmp * tmp @@ -2524,17 +2657,17 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): tmp = np.sin(0.5 * dist) return tmp * tmp - cdef inline float64_t dist_csr( - self, - const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, - const int32_t x1_start, - const int32_t x1_end, - const int32_t x2_start, - const int32_t x2_end, - const intp_t 
size, + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, ) except -1 nogil: return 2 * asin(sqrt(self.rdist_csr( x1_data, @@ -2548,12 +2681,12 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): size, ))) - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2640,7 +2773,7 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The # only way to be back compatible is to inherit `dist` from the base class # without GIL and called an inline `_dist` which acquire GIL. - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2648,7 +2781,7 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ) except -1 nogil: return self._dist(x1, x2, size) - cdef inline float64_t _dist( + cdef inline {{INPUT_DTYPE_t}} _dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 7edc64c59a050..dd66299223efe 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -36,7 +36,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): X, Y, intp_t k, - str metric="euclidean", + metric="euclidean", chunk_size=None, dict metric_kwargs=None, str strategy=None, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp index 3d0ea84b0091d..f9719f6959dfc 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp @@ -8,13 +8,7 @@ from ...utils._typedefs cimport intp_t, float64_t import numpy as np from scipy.sparse import issparse from sklearn.utils.fixes import threadpool_limits - -cpdef enum WeightingStrategy: - uniform = 0 - # TODO: Implement the following options, most likely in - # `weighted_histogram_mode` - distance = 1 - callable = 2 +from ._classmode cimport WeightingStrategy {{for name_suffix in ["32", "64"]}} from ._argkmin cimport ArgKmin{{name_suffix}} @@ -25,8 +19,8 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): {{name_suffix}}bit implementation of ArgKminClassMode. 
""" cdef: - const intp_t[:] class_membership, - const intp_t[:] unique_labels + const intp_t[:] Y_labels, + const intp_t[:] unique_Y_labels float64_t[:, :] class_scores cpp_map[intp_t, intp_t] labels_to_index WeightingStrategy weight_type @@ -38,14 +32,14 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): Y, intp_t k, weights, - class_membership, - unique_labels, + Y_labels, + unique_Y_labels, str metric="euclidean", chunk_size=None, dict metric_kwargs=None, str strategy=None, ): - """Compute the argkmin reduction with class_membership. + """Compute the argkmin reduction with Y_labels. This classmethod is responsible for introspecting the arguments values to dispatch to the most appropriate implementation of @@ -66,8 +60,8 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): chunk_size=chunk_size, strategy=strategy, weights=weights, - class_membership=class_membership, - unique_labels=unique_labels, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, ) # Limit the number of threads in second level of nested parallelism for BLAS @@ -83,8 +77,8 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): def __init__( self, DatasetsPair{{name_suffix}} datasets_pair, - const intp_t[:] class_membership, - const intp_t[:] unique_labels, + const intp_t[:] Y_labels, + const intp_t[:] unique_Y_labels, chunk_size=None, strategy=None, intp_t k=1, @@ -103,15 +97,15 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): self.weight_type = WeightingStrategy.distance else: self.weight_type = WeightingStrategy.callable - self.class_membership = class_membership + self.Y_labels = Y_labels - self.unique_labels = unique_labels + self.unique_Y_labels = unique_Y_labels cdef intp_t idx, neighbor_class_idx # Map from set of unique labels to their indices in `class_scores` # Buffer used in building a histogram for one-pass weighted mode self.class_scores = np.zeros( - (self.n_samples_X, unique_labels.shape[0]), dtype=np.float64, + (self.n_samples_X, unique_Y_labels.shape[0]), dtype=np.float64, ) def _finalize_results(self): @@ -142,7 +136,7 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): if use_distance_weighting: score_incr = 1 / distances[neighbor_rank] neighbor_idx = indices[neighbor_rank] - neighbor_class_idx = self.class_membership[neighbor_idx] + neighbor_class_idx = self.Y_labels[neighbor_idx] self.class_scores[sample_index][neighbor_class_idx] += score_incr return diff --git a/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd b/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd new file mode 100644 index 0000000000000..65db044d668e8 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd @@ -0,0 +1,5 @@ +cpdef enum WeightingStrategy: + uniform = 0 + # TODO: Implement the following options in weighted_histogram_mode + distance = 1 + callable = 2 diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp index fc56a59cab16f..1e57b3291a8f4 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -38,22 +38,22 @@ cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef: const {{INPUT_DTYPE_t}}[:] X_data - const int32_t[:] X_indices - const int32_t[:] X_indptr + const 
int32_t[::1] X_indices + const int32_t[::1] X_indptr const {{INPUT_DTYPE_t}}[:] Y_data - const int32_t[:] Y_indices - const int32_t[:] Y_indptr + const int32_t[::1] Y_indices + const int32_t[::1] Y_indptr cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef: const {{INPUT_DTYPE_t}}[:] X_data - const int32_t[:] X_indices - const int32_t[:] X_indptr + const int32_t[::1] X_indices + const int32_t[::1] X_indptr const {{INPUT_DTYPE_t}}[:] Y_data - const int32_t[:] Y_indices + const int32_t[::1] Y_indices intp_t n_Y diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 40a9a45e8b8e1..2c3ca44047145 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -1,3 +1,5 @@ +import copy + {{py: implementation_specific_values = [ @@ -53,7 +55,7 @@ cdef class DatasetsPair{{name_suffix}}: cls, X, Y, - str metric="euclidean", + metric="euclidean", dict metric_kwargs=None, ) -> DatasetsPair{{name_suffix}}: """Return the DatasetsPair implementation for the given arguments. @@ -70,7 +72,7 @@ cdef class DatasetsPair{{name_suffix}}: If provided as a ndarray, it must be C-contiguous. If provided as a sparse matrix, it must be in CSR format. - metric : str, default='euclidean' + metric : str or DistanceMetric object, default='euclidean' The distance metric to compute between rows of X and Y. The default metric is a fast implementation of the Euclidean metric. For a list of available metrics, see the documentation @@ -84,12 +86,17 @@ cdef class DatasetsPair{{name_suffix}}: datasets_pair: DatasetsPair{{name_suffix}} The suited DatasetsPair{{name_suffix}} implementation. """ - # Y_norm_squared might be propagated down to DatasetsPairs - # via metrics_kwargs when the Euclidean specialisations - # can't be used. To prevent Y_norm_squared to be passed + # X_norm_squared and Y_norm_squared might be propagated + # down to DatasetsPairs via metrics_kwargs when the Euclidean + # specialisations can't be used. + # To prevent X_norm_squared and Y_norm_squared to be passed # down to DistanceMetrics (whose constructors would raise - # a RuntimeError), we pop it here. + # a RuntimeError), we pop them here. 
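Hedged sketch of the widened ``metric`` parameter ("str or DistanceMetric object") documented just above: the dispatcher's usability check, relaxed in a hunk a little further below, should accept a pre-built `DistanceMetric` instance alongside the usual string identifiers. The private import path and the returned booleans are assumptions about this branch, not public API guarantees.

```python
# Sketch: the usability check should pass for both a string id and a DistanceMetric instance.
import numpy as np
from sklearn.metrics import DistanceMetric
from sklearn.metrics._pairwise_distances_reduction import ArgKmin

X = np.random.RandomState(0).randn(20, 3)
Y = np.random.RandomState(1).randn(30, 3)

print(ArgKmin.is_usable_for(X, Y, metric="manhattan"))                              # string id
print(ArgKmin.is_usable_for(X, Y, metric=DistanceMetric.get_metric("manhattan")))   # instance
```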
if metric_kwargs is not None: + # Copying metric_kwargs not to pop "X_norm_squared" + # and "Y_norm_squared" where they are used + metric_kwargs = copy.copy(metric_kwargs) + metric_kwargs.pop("X_norm_squared", None) metric_kwargs.pop("Y_norm_squared", None) cdef: {{DistanceMetric}} distance_metric = DistanceMetric.get_metric( @@ -231,9 +238,9 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: return self.distance_metric.rdist_csr( x1_data=&self.X_data[0], - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], x2_data=&self.Y_data[0], - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=self.Y_indptr[j], @@ -245,9 +252,9 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: return self.distance_metric.dist_csr( x1_data=&self.X_data[0], - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], x2_data=&self.Y_data[0], - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=self.Y_indptr[j], @@ -324,11 +331,11 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: return self.distance_metric.rdist_csr( x1_data=&self.X_data[0], - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], # Increment the data pointer such that x2_start=0 is aligned with the # j-th row x2_data=&self.Y_data[0] + j * self.n_features, - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=0, @@ -341,11 +348,11 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): return self.distance_metric.dist_csr( x1_data=&self.X_data[0], - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], # Increment the data pointer such that x2_start=0 is aligned with the # j-th row x2_data=&self.Y_data[0] + j * self.n_features, - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=0, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 796f15ab6fca0..e23da467d723a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -5,7 +5,11 @@ from scipy.sparse import issparse from ... import get_config -from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING64 +from .._dist_metrics import ( + BOOL_METRICS, + METRIC_MAPPING64, + DistanceMetric, +) from ._argkmin import ( ArgKmin32, ArgKmin64, @@ -117,7 +121,7 @@ def is_valid_sparse_matrix(X): and (is_numpy_c_ordered(Y) or is_valid_sparse_matrix(Y)) and X.dtype == Y.dtype and X.dtype in (np.float32, np.float64) - and metric in cls.valid_metrics() + and (metric in cls.valid_metrics() or isinstance(metric, DistanceMetric)) ) return is_usable @@ -456,7 +460,7 @@ def is_usable_for(cls, X, Y, metric) -> bool: The input array to be labelled. Y : ndarray of shape (n_samples_Y, n_features) - The input array whose labels are provided through the `labels` + The input array whose labels are provided through the `Y_labels` parameter. 
metric : str, default='euclidean' @@ -484,8 +488,8 @@ def compute( Y, k, weights, - labels, - unique_labels, + Y_labels, + unique_Y_labels, metric="euclidean", chunk_size=None, metric_kwargs=None, @@ -499,23 +503,23 @@ def compute( The input array to be labelled. Y : ndarray of shape (n_samples_Y, n_features) - The input array whose labels are provided through the `labels` - parameter. + The input array whose class membership are provided through the + `Y_labels` parameter. k : int The number of nearest neighbors to consider. weights : ndarray - The weights applied over the `labels` of `Y` when computing the + The weights applied over the `Y_labels` of `Y` when computing the weighted mode of the labels. - class_membership : ndarray + Y_labels : ndarray An array containing the index of the class membership of the associated samples in `Y`. This is used in labeling `X`. - unique_classes : ndarray + unique_Y_labels : ndarray An array containing all unique indices contained in the - corresponding `class_membership` array. + corresponding `Y_labels` array. metric : str, default='euclidean' The distance metric to use. For a list of available metrics, see @@ -587,8 +591,8 @@ def compute( Y=Y, k=k, weights=weights, - class_membership=np.array(labels, dtype=np.intp), - unique_labels=np.array(unique_labels, dtype=np.intp), + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), metric=metric, chunk_size=chunk_size, metric_kwargs=metric_kwargs, @@ -601,8 +605,8 @@ def compute( Y=Y, k=k, weights=weights, - class_membership=np.array(labels, dtype=np.intp), - unique_labels=np.array(unique_labels, dtype=np.intp), + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), metric=metric, chunk_size=chunk_size, metric_kwargs=metric_kwargs, diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index aee1615c55630..302831366aa54 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -124,7 +124,7 @@ def __call__(self, estimator, *args, **kwargs): cached_call = partial(_cached_call, cache) if _routing_enabled(): - routed_params = process_routing(self, "score", kwargs) + routed_params = process_routing(self, "score", **kwargs) else: # they all get the same args, and they all get them all routed_params = Bunch( @@ -293,6 +293,13 @@ def set_score_request(self, **kwargs): Arguments should be of the form ``param_name=alias``, and `alias` can be one of ``{True, False, None, str}``. """ + if not _routing_enabled(): + raise RuntimeError( + "This method is only available when metadata routing is enabled." + " You can enable it using" + " sklearn.set_config(enable_metadata_routing=True)." 
+ ) + self._warn_overlap( message=( "You are setting metadata request for parameters which are " diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index a05a532ecb3f2..cfcb08a312443 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -159,10 +159,10 @@ def test_classification_report_dictionary_output(): for metric in expected_report[key]: assert_almost_equal(expected_report[key][metric], report[key][metric]) - assert type(expected_report["setosa"]["precision"]) == float - assert type(expected_report["macro avg"]["precision"]) == float - assert type(expected_report["setosa"]["support"]) == int - assert type(expected_report["macro avg"]["support"]) == int + assert isinstance(expected_report["setosa"]["precision"], float) + assert isinstance(expected_report["macro avg"]["precision"], float) + assert isinstance(expected_report["setosa"]["support"], int) + assert isinstance(expected_report["macro avg"]["support"], int) def test_classification_report_output_dict_empty_input(): diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 16aa5c569b161..7d44b988b9161 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -15,6 +15,7 @@ ) from sklearn.utils import check_random_state from sklearn.utils._testing import assert_allclose, create_memmap_backed_data +from sklearn.utils.fixes import parse_version, sp_version def dist_func(x1, x2, p): @@ -42,18 +43,17 @@ def dist_func(x1, x2, p): V = rng.random_sample((d, d)) VI = np.dot(V, V.T) - METRICS_DEFAULT_PARAMS = [ ("euclidean", {}), ("cityblock", {}), - ("minkowski", dict(p=(1, 1.5, 2, 3))), + ("minkowski", dict(p=(0.5, 1, 1.5, 2, 3))), ("chebyshev", {}), ("seuclidean", dict(V=(rng.random_sample(d),))), ("mahalanobis", dict(VI=(VI,))), ("hamming", {}), ("canberra", {}), ("braycurtis", {}), - ("minkowski", dict(p=(1, 1.5, 3), w=(rng.random_sample(d),))), + ("minkowski", dict(p=(0.5, 1, 1.5, 3), w=(rng.random_sample(d),))), ] @@ -76,6 +76,13 @@ def test_cdist(metric_param_grid, X, Y): # with scipy rtol_dict = {"rtol": 1e-6} + # TODO: Remove when scipy minimum version >= 1.7.0 + # scipy supports 0= 1.7.0 + if metric == "minkowski": + p = kwargs["p"] + if sp_version < parse_version("1.7.0") and p < 1: + pytest.skip("scipy does not support 0= 1.7.0 + # scipy supports 0= 1.7.0 + if metric == "minkowski": + p = kwargs["p"] + if sp_version < parse_version("1.7.0") and p < 1: + pytest.skip("scipy does not support 0>> print(scores['train_r2']) [0.28009951 0.3908844 0.22784907] """ - X, y, groups = indexable(X, y, groups) + params = _check_params_groups_deprecation(fit_params, params, groups) + + X, y = indexable(X, y) cv = check_cv(cv, y, classifier=is_classifier(estimator)) @@ -298,7 +359,62 @@ def cross_validate( else: scorers = _check_multimetric_scoring(estimator, scoring) - indices = cv.split(X, y, groups) + if _routing_enabled(): + # `cross_validate` will create a `_MultiMetricScorer` if `scoring` is a + # dict at a later stage. We need the same object for the purpose of + # routing. However, creating it here and passing it around would create + # a much larger diff since the dict is used in many places. + if isinstance(scorers, dict): + _scorer = _MultimetricScorer( + scorers=scorers, raise_exc=(error_score == "raise") + ) + else: + _scorer = scorers + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. 
For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_validate") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + scorer=_scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + raise UnsetMetadataPassedError( + message=( + f"{sorted(e.unrequested_params.keys())} are passed to cross" + " validation but are not explicitly requested or unrequested. See" + " the Metadata Routing User guide" + " for more" + " information." + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) + routed_params.scorer = Bunch(score={}) + + indices = cv.split(X, y, **routed_params.splitter.split) if return_indices: # materialize the indices since we need to store them in the returned dict indices = list(indices) @@ -311,12 +427,13 @@ def cross_validate( clone(estimator), X, y, - scorers, - train, - test, - verbose, - None, - fit_params, + scorer=scorers, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, return_train_score=return_train_score, return_times=True, return_estimator=return_estimator, @@ -436,6 +553,7 @@ def _warn_or_raise_about_fit_failures(results, error_score): "n_jobs": [Integral, None], "verbose": ["verbose"], "fit_params": [dict, None], + "params": [dict, None], "pre_dispatch": [Integral, str, None], "error_score": [StrOptions({"raise"}), Real], }, @@ -452,6 +570,7 @@ def cross_val_score( n_jobs=None, verbose=0, fit_params=None, + params=None, pre_dispatch="2*n_jobs", error_score=np.nan, ): @@ -477,6 +596,13 @@ def cross_val_score( train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`). + .. versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_val_score(..., params={'groups': groups})``. + scoring : str or callable, default=None A str (see model evaluation documentation) or a scorer callable object / function with signature @@ -521,6 +647,16 @@ def cross_val_score( fit_params : dict, default=None Parameters to pass to the fit method of the estimator. + .. deprecated:: 1.4 + This parameter is deprecated and will be removed in version 1.6. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit``, the scorer, + and the CV splitter. + + .. versionadded:: 1.4 + pre_dispatch : int or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. 
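A small sketch of the ``fit_params`` to ``params`` migration described in these docstrings. ``sample_weight`` here is only an illustrative fit kwarg; with metadata routing left disabled, ``params`` is assumed to reach the estimator's ``fit`` through the fallback branch shown above, just as ``fit_params`` did.

```python
# Sketch: passing estimator fit kwargs through ``params`` instead of ``fit_params``.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

rng = np.random.RandomState(0)
X, y = rng.randn(60, 4), rng.randint(0, 2, 60)
w = rng.rand(60)

res = cross_validate(LogisticRegression(), X, y, cv=3, params={"sample_weight": w})
print(res["test_score"])
```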
Reducing this number can be useful to avoid an @@ -585,6 +721,7 @@ def cross_val_score( n_jobs=n_jobs, verbose=verbose, fit_params=fit_params, + params=params, pre_dispatch=pre_dispatch, error_score=error_score, ) @@ -595,12 +732,14 @@ def _fit_and_score( estimator, X, y, + *, scorer, train, test, verbose, parameters, fit_params, + score_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, @@ -654,6 +793,9 @@ def _fit_and_score( fit_params : dict or None Parameters that will be passed to ``estimator.fit``. + score_params : dict or None + Parameters that will be passed to the scorer. + return_train_score : bool, default=False Compute and return score on training set. @@ -724,6 +866,9 @@ def _fit_and_score( # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = _check_method_params(X, params=fit_params, indices=train) + score_params = score_params if score_params is not None else {} + score_params_train = _check_method_params(X, params=score_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) if parameters is not None: # here we clone the parameters, since sometimes the parameters @@ -764,10 +909,14 @@ def _fit_and_score( result["fit_error"] = None fit_time = time.time() - start_time - test_scores = _score(estimator, X_test, y_test, scorer, error_score) + test_scores = _score( + estimator, X_test, y_test, scorer, score_params_test, error_score + ) score_time = time.time() - start_time - fit_time if return_train_score: - train_scores = _score(estimator, X_train, y_train, scorer, error_score) + train_scores = _score( + estimator, X_train, y_train, scorer, score_params_train, error_score + ) if verbose > 1: total_time = score_time + fit_time @@ -809,7 +958,7 @@ def _fit_and_score( return result -def _score(estimator, X_test, y_test, scorer, error_score="raise"): +def _score(estimator, X_test, y_test, scorer, score_params, error_score="raise"): """Compute the score(s) of an estimator on a given test set. Will return a dict of floats if `scorer` is a dict, otherwise a single @@ -819,11 +968,13 @@ def _score(estimator, X_test, y_test, scorer, error_score="raise"): # will cache method calls if needed. scorer() returns a dict scorer = _MultimetricScorer(scorers=scorer, raise_exc=(error_score == "raise")) + score_params = {} if score_params is None else score_params + try: if y_test is None: - scores = scorer(estimator, X_test) + scores = scorer(estimator, X_test, **score_params) else: - scores = scorer(estimator, X_test, y_test) + scores = scorer(estimator, X_test, y_test, **score_params) except Exception: if isinstance(scorer, _MultimetricScorer): # If `_MultimetricScorer` raises exception, the `error_score` @@ -891,6 +1042,7 @@ def _score(estimator, X_test, y_test, scorer, error_score="raise"): "n_jobs": [Integral, None], "verbose": ["verbose"], "fit_params": [dict, None], + "params": [dict, None], "pre_dispatch": [Integral, str, None], "method": [ StrOptions( @@ -915,6 +1067,7 @@ def cross_val_predict( n_jobs=None, verbose=0, fit_params=None, + params=None, pre_dispatch="2*n_jobs", method="predict", ): @@ -950,6 +1103,13 @@ def cross_val_predict( train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`). + .. versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. 
When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_val_predict(..., params={'groups': groups})``. + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -983,6 +1143,16 @@ def cross_val_predict( fit_params : dict, default=None Parameters to pass to the fit method of the estimator. + .. deprecated:: 1.4 + This parameter is deprecated and will be removed in version 1.6. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit`` and the CV + splitter. + + .. versionadded:: 1.4 + pre_dispatch : int or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an @@ -1042,10 +1212,50 @@ def cross_val_predict( >>> lasso = linear_model.Lasso() >>> y_pred = cross_val_predict(lasso, X, y, cv=3) """ - X, y, groups = indexable(X, y, groups) + params = _check_params_groups_deprecation(fit_params, params, groups) + X, y = indexable(X, y) + + if _routing_enabled(): + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_validate") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata for the predict method. + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + raise UnsetMetadataPassedError( + message=( + f"{sorted(e.unrequested_params.keys())} are passed to cross" + " validation but are not explicitly requested or unrequested. See" + " the Metadata Routing User guide" + " for more" + " information." + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) cv = check_cv(cv, y, classifier=is_classifier(estimator)) - splits = list(cv.split(X, y, groups)) + splits = list(cv.split(X, y, **routed_params.splitter.split)) test_indices = np.concatenate([test for _, test in splits]) if not _check_is_permutation(test_indices, _num_samples(X)): @@ -1073,7 +1283,13 @@ def cross_val_predict( parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) predictions = parallel( delayed(_fit_and_predict)( - clone(estimator), X, y, train, test, verbose, fit_params, method + clone(estimator), + X, + y, + train, + test, + routed_params.estimator.fit, + method, ) for train, test in splits ) @@ -1103,7 +1319,7 @@ def cross_val_predict( return predictions[inv_test_indices] -def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method): +def _fit_and_predict(estimator, X, y, train, test, fit_params, method): """Fit estimator and predict values for a given dataset split. Read more in the :ref:`User Guide `. 
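The docstring additions above describe the user-facing migration: `fit_params` is deprecated in favour of `params`, and once `sklearn.set_config(enable_metadata_routing=True)` is active, `groups` has to travel inside `params` as well. A short usage sketch consistent with those notes; the estimator and data are illustrative, and group splitters are assumed to request `groups` by default:

    import numpy as np
    import sklearn
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GroupKFold, cross_val_score

    rng = np.random.RandomState(0)
    X = rng.random_sample((12, 3))
    y = np.array([0, 1] * 6)
    groups = np.repeat([0, 1, 2], 4)
    sample_weight = np.ones(12)

    # Legacy call (routing disabled): groups is its own argument, and fit
    # parameters would go through the now-deprecated ``fit_params``.
    cross_val_score(LogisticRegression(), X, y, groups=groups, cv=GroupKFold(n_splits=3))

    # With routing enabled, all metadata travels inside ``params`` and each
    # consumer must request what it uses.
    sklearn.set_config(enable_metadata_routing=True)
    est = LogisticRegression().set_fit_request(sample_weight=True)
    cross_val_score(
        est,
        X,
        y,
        cv=GroupKFold(n_splits=3),
        params={"groups": groups, "sample_weight": sample_weight},
    )
    sklearn.set_config(enable_metadata_routing=False)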
@@ -1129,9 +1345,6 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method): test : array-like of shape (n_test_samples,) Indices of test samples. - verbose : int - The verbosity level. - fit_params : dict or None Parameters that will be passed to ``estimator.fit``. @@ -1707,7 +1920,6 @@ def learning_curve( test, train_sizes_abs, scorer, - verbose, return_times, error_score=error_score, fit_params=fit_params, @@ -1726,12 +1938,14 @@ def learning_curve( clone(estimator), X, y, - scorer, - train, - test, - verbose, + scorer=scorer, + train=train, + test=test, + verbose=verbose, parameters=None, fit_params=fit_params, + # TODO(SLEP6): support score params here + score_params=None, return_train_score=True, error_score=error_score, return_times=return_times, @@ -1833,7 +2047,6 @@ def _incremental_fit_estimator( test, train_sizes, scorer, - verbose, return_times, error_score, fit_params, @@ -1863,9 +2076,27 @@ def _incremental_fit_estimator( start_score = time.time() - test_scores.append(_score(estimator, X_test, y_test, scorer, error_score)) - train_scores.append(_score(estimator, X_train, y_train, scorer, error_score)) - + # TODO(SLEP6): support score params in the following two calls + test_scores.append( + _score( + estimator, + X_test, + y_test, + scorer, + score_params=None, + error_score=error_score, + ) + ) + train_scores.append( + _score( + estimator, + X_train, + y_train, + scorer, + score_params=None, + error_score=error_score, + ) + ) score_time = time.time() - start_score score_times.append(score_time) @@ -2025,12 +2256,14 @@ def validation_curve( clone(estimator), X, y, - scorer, - train, - test, - verbose, + scorer=scorer, + train=train, + test=test, + verbose=verbose, parameters={param_name: v}, fit_params=fit_params, + # TODO(SLEP6): support score params here + score_params=None, return_train_score=True, error_score=error_score, ) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 6ea52049f3ced..04c3f1f156fab 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -900,18 +900,16 @@ def check_cv_results_array_types(search, param_keys, score_keys): assert cv_results["rank_test_%s" % key].dtype == np.int32 -def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand): +def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand, extra_keys=()): # Test the search.cv_results_ contains all the required results - assert_array_equal( - sorted(cv_results.keys()), sorted(param_keys + score_keys + ("params",)) - ) + all_keys = param_keys + score_keys + extra_keys + assert_array_equal(sorted(cv_results.keys()), sorted(all_keys + ("params",))) assert all(cv_results[key].shape == (n_cand,) for key in param_keys + score_keys) def test_grid_search_cv_results(): X, y = make_classification(n_samples=50, n_features=4, random_state=42) - n_splits = 3 n_grid_points = 6 params = [ dict( @@ -949,9 +947,7 @@ def test_grid_search_cv_results(): ) n_candidates = n_grid_points - search = GridSearchCV( - SVC(), cv=n_splits, param_grid=params, return_train_score=True - ) + search = GridSearchCV(SVC(), cv=3, param_grid=params, return_train_score=True) search.fit(X, y) cv_results = search.cv_results_ # Check if score and timing are reasonable @@ -967,17 +963,20 @@ def test_grid_search_cv_results(): check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates) # Check masking cv_results = search.cv_results_ - n_candidates = 
len(search.cv_results_["params"]) - assert all( + + poly_results = [ ( cv_results["param_C"].mask[i] and cv_results["param_gamma"].mask[i] and not cv_results["param_degree"].mask[i] ) for i in range(n_candidates) - if cv_results["param_kernel"][i] == "linear" - ) - assert all( + if cv_results["param_kernel"][i] == "poly" + ] + assert all(poly_results) + assert len(poly_results) == 2 + + rbf_results = [ ( not cv_results["param_C"].mask[i] and not cv_results["param_gamma"].mask[i] @@ -985,13 +984,14 @@ def test_grid_search_cv_results(): ) for i in range(n_candidates) if cv_results["param_kernel"][i] == "rbf" - ) + ] + assert all(rbf_results) + assert len(rbf_results) == 4 def test_random_search_cv_results(): X, y = make_classification(n_samples=50, n_features=4, random_state=42) - n_splits = 3 n_search_iter = 30 params = [ @@ -1016,12 +1016,12 @@ def test_random_search_cv_results(): "mean_score_time", "std_score_time", ) - n_cand = n_search_iter + n_candidates = n_search_iter search = RandomizedSearchCV( SVC(), n_iter=n_search_iter, - cv=n_splits, + cv=3, param_distributions=params, return_train_score=True, ) @@ -1029,8 +1029,7 @@ def test_random_search_cv_results(): cv_results = search.cv_results_ # Check results structure check_cv_results_array_types(search, param_keys, score_keys) - check_cv_results_keys(cv_results, param_keys, score_keys, n_cand) - n_candidates = len(search.cv_results_["params"]) + check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates) assert all( ( cv_results["param_C"].mask[i] @@ -1038,7 +1037,7 @@ def test_random_search_cv_results(): and not cv_results["param_degree"].mask[i] ) for i in range(n_candidates) - if cv_results["param_kernel"][i] == "linear" + if cv_results["param_kernel"][i] == "poly" ) assert all( ( diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index d92f624441541..151498205dd39 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -14,6 +14,7 @@ ) from scipy.special import comb +from sklearn import config_context from sklearn.datasets import load_digits, make_classification from sklearn.dummy import DummyClassifier from sklearn.model_selection import ( @@ -43,7 +44,15 @@ _yields_constant_splits, ) from sklearn.svm import SVC -from sklearn.tests.test_metadata_routing import assert_request_is_empty +from sklearn.tests.metadata_routing_common import assert_request_is_empty +from sklearn.utils._array_api import ( + _convert_to_numpy, + get_namespace, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._array_api import ( + device as array_api_device, +) from sklearn.utils._mocking import MockDataFrame from sklearn.utils._testing import ( assert_allclose, @@ -51,6 +60,9 @@ assert_array_equal, ignore_warnings, ) +from sklearn.utils.estimator_checks import ( + _array_api_for_tests, +) from sklearn.utils.validation import _num_samples NO_GROUP_SPLITTERS = [ @@ -1259,6 +1271,70 @@ def test_train_test_split_default_test_size(train_size, exp_train, exp_test): assert len(X_test) == exp_test +@pytest.mark.parametrize( + "array_namepsace, device, dtype", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize( + "shuffle,stratify", + ( + (True, None), + (True, np.hstack((np.ones(6), np.zeros(4)))), + # stratification only works with shuffling + (False, None), + ), +) +def test_array_api_train_test_split(shuffle, stratify, array_namepsace, device, dtype): + xp, device, dtype = 
_array_api_for_tests(array_namepsace, device, dtype) + + X = np.arange(100).reshape((10, 10)) + y = np.arange(10) + + X_np = X.astype(dtype) + X_xp = xp.asarray(X_np, device=device) + + y_np = y.astype(dtype) + y_xp = xp.asarray(y_np, device=device) + + X_train_np, X_test_np, y_train_np, y_test_np = train_test_split( + X_np, y, random_state=0, shuffle=shuffle, stratify=stratify + ) + with config_context(array_api_dispatch=True): + if stratify is not None: + stratify_xp = xp.asarray(stratify) + else: + stratify_xp = stratify + X_train_xp, X_test_xp, y_train_xp, y_test_xp = train_test_split( + X_xp, y_xp, shuffle=shuffle, stratify=stratify_xp, random_state=0 + ) + + # Check that namespace is preserved, has to happen with + # array_api_dispatch enabled. + assert get_namespace(X_train_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(X_test_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(y_train_xp)[0] == get_namespace(y_xp)[0] + assert get_namespace(y_test_xp)[0] == get_namespace(y_xp)[0] + + # Check device and dtype is preserved on output + assert array_api_device(X_train_xp) == array_api_device(X_xp) + assert array_api_device(y_train_xp) == array_api_device(y_xp) + assert array_api_device(X_test_xp) == array_api_device(X_xp) + assert array_api_device(y_test_xp) == array_api_device(y_xp) + + assert X_train_xp.dtype == X_xp.dtype + assert y_train_xp.dtype == y_xp.dtype + assert X_test_xp.dtype == X_xp.dtype + assert y_test_xp.dtype == y_xp.dtype + + assert_allclose( + _convert_to_numpy(X_train_xp, xp=xp), + X_train_np, + ) + assert_allclose( + _convert_to_numpy(X_test_xp, xp=xp), + X_test_np, + ) + + def test_train_test_split(): X = np.arange(100).reshape((10, 10)) X_s = coo_matrix(X) @@ -1808,7 +1884,7 @@ def test_nested_cv(): error_score="raise", ) cross_val_score( - gs, X=X, y=y, groups=groups, cv=outer_cv, fit_params={"groups": groups} + gs, X=X, y=y, groups=groups, cv=outer_cv, params={"groups": groups} ) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index d4cc09ee01044..6c89f89afa684 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from scipy.stats import norm, randint +from scipy.stats import expon, norm, randint from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier @@ -23,7 +23,11 @@ _SubsampleMetaSplitter, _top_k, ) -from sklearn.svm import LinearSVC +from sklearn.model_selection.tests.test_search import ( + check_cv_results_array_types, + check_cv_results_keys, +) +from sklearn.svm import SVC, LinearSVC class FastClassifier(DummyClassifier): @@ -777,3 +781,68 @@ def test_select_best_index(SearchCV): # we expect the index of 'i' best_index = SearchCV._select_best_index(None, None, results) assert best_index == 8 + + +def test_halving_random_search_list_of_dicts(): + """Check the behaviour of the `HalvingRandomSearchCV` with `param_distribution` + being a list of dictionary. 
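The new `test_halving_random_search_list_of_dicts` below checks that `HalvingRandomSearchCV` accepts `param_distributions` as a list of dicts, mirroring what `RandomizedSearchCV` already accepts. A compact usage sketch of that call pattern, as exercised by the test (note the experimental import the halving estimators still require):

    from scipy.stats import expon
    from sklearn.datasets import make_classification
    from sklearn.experimental import enable_halving_search_cv  # noqa: F401
    from sklearn.model_selection import HalvingRandomSearchCV
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=150, n_features=4, random_state=42)
    # Two sub-spaces: one continuous (rbf) and one discrete (poly degrees).
    param_distributions = [
        {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)},
        {"kernel": ["poly"], "degree": [2, 3]},
    ]
    search = HalvingRandomSearchCV(
        SVC(), param_distributions=param_distributions, cv=3, random_state=0
    ).fit(X, y)
    print(search.best_params_)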
+ """ + X, y = make_classification(n_samples=150, n_features=4, random_state=42) + + params = [ + {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)}, + {"kernel": ["poly"], "degree": [2, 3]}, + ] + param_keys = ( + "param_C", + "param_degree", + "param_gamma", + "param_kernel", + ) + score_keys = ( + "mean_test_score", + "mean_train_score", + "rank_test_score", + "split0_test_score", + "split1_test_score", + "split2_test_score", + "split0_train_score", + "split1_train_score", + "split2_train_score", + "std_test_score", + "std_train_score", + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ) + extra_keys = ("n_resources", "iter") + + search = HalvingRandomSearchCV( + SVC(), cv=3, param_distributions=params, return_train_score=True, random_state=0 + ) + search.fit(X, y) + n_candidates = sum(search.n_candidates_) + cv_results = search.cv_results_ + # Check results structure + check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys) + check_cv_results_array_types(search, param_keys, score_keys) + + assert all( + ( + cv_results["param_C"].mask[i] + and cv_results["param_gamma"].mask[i] + and not cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "poly" + ) + assert all( + ( + not cv_results["param_C"].mask[i] + and not cv_results["param_gamma"].mask[i] + and cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "rbf" + ) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index ba9f66ab240e4..c944b06b30860 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -73,6 +73,13 @@ from sklearn.pipeline import Pipeline from sklearn.preprocessing import LabelEncoder, scale from sklearn.svm import SVC, LinearSVC +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingScorer, + ConsumingSplitter, + _Registry, + check_recorded_metadata, +) from sklearn.utils import shuffle from sklearn.utils._mocking import CheckingClassifier, MockDataFrame from sklearn.utils._testing import ( @@ -706,7 +713,7 @@ def assert_fit_params(clf): "dummy_obj": DUMMY_OBJ, "callback": assert_fit_params, } - cross_val_score(clf, X, y, fit_params=fit_params) + cross_val_score(clf, X, y, params=fit_params) def test_cross_val_score_score_func(): @@ -1160,7 +1167,7 @@ def test_cross_val_score_sparse_fit_params(): X, y = iris.data, iris.target clf = MockClassifier() fit_params = {"sparse_sample_weight": coo_matrix(np.eye(X.shape[0]))} - a = cross_val_score(clf, X, y, fit_params=fit_params, cv=3) + a = cross_val_score(clf, X, y, params=fit_params, cv=3) assert_array_equal(a, np.ones(3)) @@ -2082,12 +2089,23 @@ def test_fit_and_score_failing(): failing_clf = FailingClassifier(FailingClassifier.FAILING_PARAMETER) # dummy X data X = np.arange(1, 10) - fit_and_score_args = [failing_clf, X, None, dict(), None, None, 0, None, None] + fit_and_score_args = dict( + estimator=failing_clf, + X=X, + y=None, + scorer=dict(), + train=None, + test=None, + verbose=0, + parameters=None, + fit_params=None, + score_params=None, + ) # passing error score to trigger the warning message - fit_and_score_kwargs = {"error_score": "raise"} + fit_and_score_args["error_score"] = "raise" # check if exception was raised, with default error_score='raise' with pytest.raises(ValueError, match="Failing classifier failed as 
required"): - _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) + _fit_and_score(**fit_and_score_args) assert failing_clf.score() == 0.0 # FailingClassifier coverage @@ -2097,14 +2115,21 @@ def test_fit_and_score_working(): clf = SVC(kernel="linear", random_state=0) train, test = next(ShuffleSplit().split(X)) # Test return_parameters option - fit_and_score_args = [clf, X, y, dict(), train, test, 0] - fit_and_score_kwargs = { - "parameters": {"max_iter": 100, "tol": 0.1}, - "fit_params": None, - "return_parameters": True, - } - result = _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) - assert result["parameters"] == fit_and_score_kwargs["parameters"] + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=dict(), + train=train, + test=test, + verbose=0, + parameters={"max_iter": 100, "tol": 0.1}, + fit_params=None, + score_params=None, + return_parameters=True, + ) + result = _fit_and_score(**fit_and_score_args) + assert result["parameters"] == fit_and_score_args["parameters"] class DataDependentFailingClassifier(BaseEstimator): @@ -2315,13 +2340,22 @@ def test_fit_and_score_verbosity( train, test = next(ShuffleSplit().split(X)) # test print without train score - fit_and_score_args = [clf, X, y, scorer, train, test, verbose, None, None] - fit_and_score_kwargs = { - "return_train_score": train_score, - "split_progress": split_prg, - "candidate_progress": cdt_prg, - } - _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=scorer, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=None, + score_params=None, + return_train_score=train_score, + split_progress=split_prg, + candidate_progress=cdt_prg, + ) + _fit_and_score(**fit_and_score_args) out, _ = capsys.readouterr() outlines = out.split("\n") if len(outlines) > 2: @@ -2336,9 +2370,15 @@ def test_score(): def two_params_scorer(estimator, X_test): return None - fit_and_score_args = [None, None, None, two_params_scorer] with pytest.raises(ValueError, match=error_message): - _score(*fit_and_score_args, error_score=np.nan) + _score( + estimator=None, + X_test=None, + y_test=None, + scorer=two_params_scorer, + score_params=None, + error_score=np.nan, + ) def test_callable_multimetric_confusion_matrix_cross_validate(): @@ -2391,3 +2431,149 @@ def test_cross_validate_return_indices(global_random_seed): for split_idx, (expected_train_idx, expected_test_idx) in enumerate(cv.split(X, y)): assert_array_equal(train_indices[split_idx], expected_train_idx) assert_array_equal(test_indices[split_idx], expected_test_idx) + + +# Tests for metadata routing in cross_val* +# ======================================== + + +# TODO(1.6): remove this test in 1.6 +def test_cross_validate_fit_param_deprecation(): + """Check that we warn about deprecating `fit_params`.""" + with pytest.warns(FutureWarning, match="`fit_params` is deprecated"): + cross_validate(estimator=ConsumingClassifier(), X=X, y=y, cv=2, fit_params={}) + + with pytest.raises( + ValueError, match="`params` and `fit_params` cannot both be provided" + ): + cross_validate( + estimator=ConsumingClassifier(), X=X, y=y, fit_params={}, params={} + ) + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "cv_method", [cross_validate, cross_val_score, cross_val_predict] +) +def test_groups_with_routing_validation(cv_method): + """Check that we raise an error if `groups` are passed to the cv method instead + of `params` when metadata routing is enabled. 
+ """ + with pytest.raises(ValueError, match="`groups` can only be passed if"): + cv_method( + estimator=ConsumingClassifier(), + X=X, + y=y, + groups=[], + ) + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "cv_method", [cross_validate, cross_val_score, cross_val_predict] +) +def test_passed_unrequested_metadata(cv_method): + """Check that we raise an error when passing metadata that is not + requested.""" + err_msg = re.escape("['metadata'] are passed to cross validation") + with pytest.raises(ValueError, match=err_msg): + cv_method( + estimator=ConsumingClassifier(), + X=X, + y=y, + params=dict(metadata=[]), + ) + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "cv_method", [cross_validate, cross_val_score, cross_val_predict] +) +def test_cross_validate_routing(cv_method): + """Check that the respective cv method is properly dispatching the metadata + to the consumer.""" + scorer_registry = _Registry() + scorer = ConsumingScorer(registry=scorer_registry).set_score_request( + sample_weight="score_weights", metadata="score_metadata" + ) + splitter_registry = _Registry() + splitter = ConsumingSplitter(registry=splitter_registry).set_split_request( + groups="split_groups", metadata="split_metadata" + ) + estimator_registry = _Registry() + estimator = ConsumingClassifier(registry=estimator_registry).set_fit_request( + sample_weight="fit_sample_weight", metadata="fit_metadata" + ) + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + score_weights = rng.rand(n_samples) + score_metadata = rng.rand(n_samples) + split_groups = rng.randint(0, 3, n_samples) + split_metadata = rng.rand(n_samples) + fit_sample_weight = rng.rand(n_samples) + fit_metadata = rng.rand(n_samples) + + extra_params = { + cross_validate: dict(scoring=dict(my_scorer=scorer, accuracy="accuracy")), + # cross_val_score doesn't support multiple scorers + cross_val_score: dict(scoring=scorer), + # cross_val_predict doesn't need a scorer + cross_val_predict: dict(), + } + + params = dict( + split_groups=split_groups, + split_metadata=split_metadata, + fit_sample_weight=fit_sample_weight, + fit_metadata=fit_metadata, + ) + + if cv_method is not cross_val_predict: + params.update( + score_weights=score_weights, + score_metadata=score_metadata, + ) + + cv_method( + estimator, + X=X, + y=y, + cv=splitter, + **extra_params[cv_method], + params=params, + ) + + if cv_method is not cross_val_predict: + # cross_val_predict doesn't need a scorer + assert len(scorer_registry) + for _scorer in scorer_registry: + check_recorded_metadata( + obj=_scorer, + method="score", + split_params=("sample_weight", "metadata"), + sample_weight=score_weights, + metadata=score_metadata, + ) + + assert len(splitter_registry) + for _splitter in splitter_registry: + check_recorded_metadata( + obj=_splitter, + method="split", + groups=split_groups, + metadata=split_metadata, + ) + + assert len(estimator_registry) + for _estimator in estimator_registry: + check_recorded_metadata( + obj=_estimator, + method="fit", + split_params=("sample_weight", "metadata"), + sample_weight=fit_sample_weight, + metadata=fit_metadata, + ) + + +# End of metadata routing tests +# ============================= diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 8bd71924f954b..a75f41307b758 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -163,10 +163,10 @@ def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_para if _routing_enabled(): routed_params = 
process_routing( - obj=self, - method="partial_fit", - other_params=partial_fit_params, + self, + "partial_fit", sample_weight=sample_weight, + **partial_fit_params, ) else: if sample_weight is not None and not has_fit_parameter( @@ -249,10 +249,10 @@ def fit(self, X, y, sample_weight=None, **fit_params): if _routing_enabled(): routed_params = process_routing( - obj=self, - method="fit", - other_params=fit_params, + self, + "fit", sample_weight=sample_weight, + **fit_params, ) else: if sample_weight is not None and not has_fit_parameter( @@ -706,9 +706,7 @@ def fit(self, X, Y, **fit_params): del Y_pred_chain if _routing_enabled(): - routed_params = process_routing( - obj=self, method="fit", other_params=fit_params - ) + routed_params = process_routing(self, "fit", **fit_params) else: routed_params = Bunch(estimator=Bunch(fit=fit_params)) diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx deleted file mode 100644 index d9b933cb43c66..0000000000000 --- a/sklearn/neighbors/_ball_tree.pyx +++ /dev/null @@ -1,195 +0,0 @@ -# Author: Jake Vanderplas -# License: BSD 3 clause - -__all__ = ['BallTree'] - -DOC_DICT = {'BinaryTree': 'BallTree', 'binary_tree': 'ball_tree'} - -VALID_METRICS = [ - 'BrayCurtisDistance64', - 'CanberraDistance64', - 'ChebyshevDistance64', - 'DiceDistance64', - 'EuclideanDistance64', - 'HammingDistance64', - 'HaversineDistance64', - 'JaccardDistance64', - 'MahalanobisDistance64', - 'ManhattanDistance64', - 'MinkowskiDistance64', - 'PyFuncDistance64', - 'RogersTanimotoDistance64', - 'RussellRaoDistance64', - 'SEuclideanDistance64', - 'SokalMichenerDistance64', - 'SokalSneathDistance64', - 'WMinkowskiDistance64', -] - -include "_binary_tree.pxi" - -# Inherit BallTree from BinaryTree -cdef class BallTree(BinaryTree): - __doc__ = CLASS_DOC.format(**DOC_DICT) - pass - - -# ---------------------------------------------------------------------- -# The functions below specialized the Binary Tree as a Ball Tree -# -# Note that these functions use the concept of "reduced distance". -# The reduced distance, defined for some metrics, is a quantity which -# is more efficient to compute than the distance, but preserves the -# relative rankings of the true distance. For example, the reduced -# distance for the Euclidean metric is the squared-euclidean distance. -# For some metrics, the reduced distance is simply the distance. 
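The comment above states the invariant the tree code exploits: the reduced distance (squared Euclidean for the Euclidean metric) is cheaper to compute but induces the same ordering as the true distance, so pruning and neighbor selection can run on `rdist` and only convert back to a true distance at the end. A small NumPy check of that rank-preservation property:

    import numpy as np

    rng = np.random.RandomState(0)
    query = rng.random_sample(3)
    points = rng.random_sample((10, 3))

    dist = np.sqrt(((points - query) ** 2).sum(axis=1))   # true Euclidean distance
    rdist = ((points - query) ** 2).sum(axis=1)           # reduced distance: no sqrt

    # Identical neighbor ordering, so pruning decisions based on rdist are valid.
    assert np.array_equal(np.argsort(dist), np.argsort(rdist))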
- -cdef int allocate_data(BinaryTree tree, intp_t n_nodes, - intp_t n_features) except -1: - """Allocate arrays needed for the KD Tree""" - tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype=np.float64) - return 0 - - -cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, intp_t i_node, - intp_t idx_start, intp_t idx_end) except -1: - """Initialize the node for the dataset stored in tree.data""" - cdef intp_t n_features = tree.data.shape[1] - cdef intp_t n_points = idx_end - idx_start - - cdef intp_t i, j - cdef float64_t radius - cdef float64_t *this_pt - - cdef intp_t* idx_array = &tree.idx_array[0] - cdef float64_t* data = &tree.data[0, 0] - cdef float64_t* centroid = &tree.node_bounds[0, i_node, 0] - - cdef bint with_sample_weight = tree.sample_weight is not None - cdef float64_t* sample_weight - cdef float64_t sum_weight_node - if with_sample_weight: - sample_weight = &tree.sample_weight[0] - - # determine Node centroid - for j in range(n_features): - centroid[j] = 0 - - if with_sample_weight: - sum_weight_node = 0 - for i in range(idx_start, idx_end): - sum_weight_node += sample_weight[idx_array[i]] - this_pt = data + n_features * idx_array[i] - for j from 0 <= j < n_features: - centroid[j] += this_pt[j] * sample_weight[idx_array[i]] - - for j in range(n_features): - centroid[j] /= sum_weight_node - else: - for i in range(idx_start, idx_end): - this_pt = data + n_features * idx_array[i] - for j from 0 <= j < n_features: - centroid[j] += this_pt[j] - - for j in range(n_features): - centroid[j] /= n_points - - # determine Node radius - radius = 0 - for i in range(idx_start, idx_end): - radius = fmax(radius, - tree.rdist(centroid, - data + n_features * idx_array[i], - n_features)) - - node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) - node_data[i_node].idx_start = idx_start - node_data[i_node].idx_end = idx_end - return 0 - - -cdef inline float64_t min_dist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1 nogil: - """Compute the minimum distance between a point and a node""" - cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - return fmax(0, dist_pt - tree.node_data[i_node].radius) - - -cdef inline float64_t max_dist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1: - """Compute the maximum distance between a point and a node""" - cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - return dist_pt + tree.node_data[i_node].radius - - -cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, - float64_t* min_dist, float64_t* max_dist) except -1 nogil: - """Compute the minimum and maximum distance between a point and a node""" - cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - cdef float64_t rad = tree.node_data[i_node].radius - min_dist[0] = fmax(0, dist_pt - rad) - max_dist[0] = dist_pt + rad - return 0 - - -cdef inline float64_t min_rdist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1 nogil: - """Compute the minimum reduced-distance between a point and a node""" - if tree.euclidean: - return euclidean_dist_to_rdist64(min_dist(tree, i_node, pt)) - else: - return tree.dist_metric._dist_to_rdist(min_dist(tree, i_node, pt)) - - -cdef inline float64_t max_rdist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1: - """Compute the maximum reduced-distance between a point and a node""" - if tree.euclidean: - return euclidean_dist_to_rdist64(max_dist(tree, i_node, pt)) 
- else: - return tree.dist_metric._dist_to_rdist(max_dist(tree, i_node, pt)) - - -cdef inline float64_t min_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the minimum distance between two nodes""" - cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], - &tree1.node_bounds[0, i_node1, 0], - tree1.data.shape[1]) - return fmax(0, (dist_pt - tree1.node_data[i_node1].radius - - tree2.node_data[i_node2].radius)) - - -cdef inline float64_t max_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the maximum distance between two nodes""" - cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], - &tree1.node_bounds[0, i_node1, 0], - tree1.data.shape[1]) - return (dist_pt + tree1.node_data[i_node1].radius - + tree2.node_data[i_node2].radius) - - -cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the minimum reduced distance between two nodes""" - if tree1.euclidean: - return euclidean_dist_to_rdist64(min_dist_dual(tree1, i_node1, - tree2, i_node2)) - else: - return tree1.dist_metric._dist_to_rdist(min_dist_dual(tree1, i_node1, - tree2, i_node2)) - - -cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the maximum reduced distance between two nodes""" - if tree1.euclidean: - return euclidean_dist_to_rdist64(max_dist_dual(tree1, i_node1, - tree2, i_node2)) - else: - return tree1.dist_metric._dist_to_rdist(max_dist_dual(tree1, i_node1, - tree2, i_node2)) diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp new file mode 100644 index 0000000000000..92b26714e5d9f --- /dev/null +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -0,0 +1,284 @@ +{{py: + +# Generated file: _ball_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + +# Author: Jake Vanderplas +# License: BSD 3 clause + +}} + + +__all__ = ['BallTree', 'BallTree64', 'BallTree32'] + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'BallTree{{name_suffix}}', + 'binary_tree': 'ball_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'BrayCurtisDistance{{name_suffix}}', + 'CanberraDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'DiceDistance{{name_suffix}}', + 'EuclideanDistance{{name_suffix}}', + 'HammingDistance{{name_suffix}}', + 'HaversineDistance{{name_suffix}}', + 'JaccardDistance{{name_suffix}}', + 'MahalanobisDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}', + 'PyFuncDistance{{name_suffix}}', + 'RogersTanimotoDistance{{name_suffix}}', + 'RussellRaoDistance{{name_suffix}}', + 'SEuclideanDistance{{name_suffix}}', + 'SokalMichenerDistance{{name_suffix}}', + 'SokalSneathDistance{{name_suffix}}', + 'WMinkowskiDistance{{name_suffix}}', +] + +{{endfor}} + +include "_binary_tree.pxi" + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit BallTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class BallTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) + pass + +{{endfor}} + + 
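The `{{py: ...}}`, `{{for ...}}`, and `{{endfor}}` markers above are Tempita template syntax: at build time `_ball_tree.pyx.tp` is rendered into a `.pyx` file containing a float64 and a float32 specialization of every class and helper. A toy rendering sketch, assuming the Tempita package bundled with Cython (which scikit-learn's build helpers rely on) is available; the template body is made up for illustration:

    # Illustrative only: renders a tiny template using the same {{for}} syntax
    # as the .pyx.tp files above.
    from Cython import Tempita

    template = Tempita.Template(
        """
    {{for name_suffix in ['64', '32']}}
    cdef class BallTree{{name_suffix}}(BinaryTree{{name_suffix}}):
        pass
    {{endfor}}
    """
    )
    # Expands the loop into one class stub per dtype suffix.
    print(template.substitute())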
+#---------------------------------------------------------------------- +# The functions below specialized the Binary Tree as a Ball Tree +# +# Note that these functions use the concept of "reduced distance". +# The reduced distance, defined for some metrics, is a quantity which +# is more efficient to compute than the distance, but preserves the +# relative rankings of the true distance. For example, the reduced +# distance for the Euclidean metric is the squared-euclidean distance. +# For some metrics, the reduced distance is simply the distance. + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: + """Allocate arrays needed for the KD Tree""" + tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype={{INPUT_DTYPE}}) + return 0 + + +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: + """Initialize the node for the dataset stored in tree.data""" + cdef intp_t n_features = tree.data.shape[1] + cdef intp_t n_points = idx_end - idx_start + + cdef intp_t i, j + cdef float64_t radius + cdef {{INPUT_DTYPE_t}} *this_pt + + cdef intp_t* idx_array = &tree.idx_array[0] + cdef {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* centroid = &tree.node_bounds[0, i_node, 0] + + cdef bint with_sample_weight = tree.sample_weight is not None + cdef {{INPUT_DTYPE_t}}* sample_weight + cdef float64_t sum_weight_node + if with_sample_weight: + sample_weight = &tree.sample_weight[0] + + # determine Node centroid + for j in range(n_features): + centroid[j] = 0 + + if with_sample_weight: + sum_weight_node = 0 + for i in range(idx_start, idx_end): + sum_weight_node += sample_weight[idx_array[i]] + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] * sample_weight[idx_array[i]] + + for j in range(n_features): + centroid[j] /= sum_weight_node + else: + for i in range(idx_start, idx_end): + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] + + for j in range(n_features): + centroid[j] /= n_points + + # determine Node radius + radius = 0 + for i in range(idx_start, idx_end): + radius = fmax(radius, + tree.rdist(centroid, + data + n_features * idx_array[i], + n_features)) + + node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end + return 0 + + +cdef inline float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return fmax(0, dist_pt - tree.node_data[i_node].radius) + + +cdef inline float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return dist_pt + tree.node_data[i_node].radius + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 
nogil: + """Compute the minimum and maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + cdef float64_t rad = tree.node_data[i_node].radius + min_dist[0] = fmax(0, dist_pt - rad) + max_dist[0] = dist_pt + rad + return 0 + + +cdef inline float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return fmax(0, (dist_pt - tree1.node_data[i_node1].radius + - tree2.node_data[i_node2].radius)) + + +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return (dist_pt + tree1.node_data[i_node1].radius + + tree2.node_data[i_node2].radius) + + +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + + +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class BallTree(BallTree64): + __doc__ = CLASS_DOC.format(BinaryTree="BallTree") + pass diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index dcff18e10fa48..519db9bead3d3 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -19,7 +19,7 @@ from ..base import BaseEstimator, MultiOutputMixin, is_classifier from ..exceptions import DataConversionWarning, EfficiencyWarning -from ..metrics import 
pairwise_distances_chunked +from ..metrics import DistanceMetric, pairwise_distances_chunked from ..metrics._pairwise_distances_reduction import ( ArgKmin, RadiusNeighbors, @@ -414,7 +414,11 @@ def _check_algorithm_metric(self): if self.algorithm == "auto": if self.metric == "precomputed": alg_check = "brute" - elif callable(self.metric) or self.metric in VALID_METRICS["ball_tree"]: + elif ( + callable(self.metric) + or self.metric in VALID_METRICS["ball_tree"] + or isinstance(self.metric, DistanceMetric) + ): alg_check = "ball_tree" else: alg_check = "brute" @@ -430,7 +434,9 @@ def _check_algorithm_metric(self): "in very poor performance." % self.metric ) - elif self.metric not in VALID_METRICS[alg_check]: + elif self.metric not in VALID_METRICS[alg_check] and not isinstance( + self.metric, DistanceMetric + ): raise ValueError( "Metric '%s' not valid. Use " "sorted(sklearn.neighbors.VALID_METRICS['%s']) " @@ -563,9 +569,11 @@ def _fit(self, X, y=None): if self.algorithm not in ("auto", "brute"): warnings.warn("cannot use tree with sparse input: using brute force") - if self.effective_metric_ not in VALID_METRICS_SPARSE[ - "brute" - ] and not callable(self.effective_metric_): + if ( + self.effective_metric_ not in VALID_METRICS_SPARSE["brute"] + and not callable(self.effective_metric_) + and not isinstance(self.effective_metric_, DistanceMetric) + ): raise ValueError( "Metric '%s' not valid for sparse input. " "Use sorted(sklearn.neighbors." diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi.tp similarity index 90% rename from sklearn/neighbors/_binary_tree.pxi rename to sklearn/neighbors/_binary_tree.pxi.tp index b60ea3a0a6d70..6322f809f7eb9 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -1,14 +1,32 @@ -#!python +{{py: +# Generated file: _binary_tree.pxi + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE + # + ('64', 'float64_t', 'np.float64', 'cnp.NPY_DOUBLE'), + ('32', 'float32_t', 'np.float32', 'cnp.NPY_FLOAT') +] # KD Tree and Ball Tree # ===================== # # Author: Jake Vanderplas , 2012-2013 +# Omar Salman +# # License: BSD # -# This file is meant to be a literal include in a pyx file. -# See ball_tree.pyx and kd_tree.pyx +# _binary_tree.pxi is generated and is then literally Cython included in +# ball_tree.pyx and kd_tree.pyx. See ball_tree.pyx.tp and kd_tree.pyx.tp. + +}} + + +# KD Tree and Ball Tree +# ===================== # # The routines here are the core algorithms of the KDTree and BallTree # structures. 
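The `_base.py` hunk above lets a pre-constructed `DistanceMetric` instance pass metric validation and be treated as tree-compatible under `algorithm="auto"`. A hedged sketch of the usage this appears to enable; anything beyond what the diff shows is an assumption:

    import numpy as np
    from sklearn.metrics import DistanceMetric
    from sklearn.neighbors import NearestNeighbors

    X = np.random.RandomState(0).random_sample((30, 4))

    # Pass a DistanceMetric object instead of a metric name; with the change
    # above this should be accepted and routed to a tree-based algorithm.
    manhattan = DistanceMetric.get_metric("manhattan")
    nn = NearestNeighbors(n_neighbors=3, metric=manhattan).fit(X)
    distances, indices = nn.kneighbors(X[:2])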
If Cython supported polymorphism, we would be able to @@ -143,6 +161,7 @@ # """Compute the maximum distance between two nodes""" cimport numpy as cnp +from cython cimport floating from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma from libc.math cimport fmin, fmax from libc.stdlib cimport calloc, malloc, free @@ -154,15 +173,19 @@ import warnings from ..metrics._dist_metrics cimport ( DistanceMetric, DistanceMetric64, + DistanceMetric32, euclidean_dist64, + euclidean_dist32, euclidean_rdist64, + euclidean_rdist32, euclidean_dist_to_rdist64, + euclidean_dist_to_rdist32, ) from ._partition_nodes cimport partition_node_indices from ..utils import check_array -from ..utils._typedefs cimport float64_t, intp_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t from ..utils._heap cimport heap_push from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort @@ -500,8 +523,9 @@ def kernel_norm(h, d, kernel, return_log=False): else: return np.exp(result) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} -cdef class NeighborsHeap: +cdef class NeighborsHeap{{name_suffix}}: """A max-heap structure to keep track of distances/indices of neighbors This implements an efficient pre-allocated set of fixed-size heaps @@ -516,19 +540,19 @@ cdef class NeighborsHeap: n_nbrs : int the size of each heap. """ - cdef float64_t[:, ::1] distances + cdef {{INPUT_DTYPE_t}}[:, ::1] distances cdef intp_t[:, ::1] indices def __cinit__(self): # One-element arrays are used as placeholders to prevent # any problem due to potential access to those attributes # (e.g. assigning to NULL or a to value in another segment). - self.distances = np.zeros((1, 1), dtype=np.float64, order='C') + self.distances = np.zeros((1, 1), dtype={{INPUT_DTYPE}}, order='C') self.indices = np.zeros((1, 1), dtype=np.intp, order='C') def __init__(self, n_pts, n_nbrs): self.distances = np.full( - (n_pts, n_nbrs), np.inf, dtype=np.float64, order='C' + (n_pts, n_nbrs), np.inf, dtype={{INPUT_DTYPE}}, order='C' ) self.indices = np.zeros((n_pts, n_nbrs), dtype=np.intp, order='C') @@ -571,14 +595,16 @@ cdef class NeighborsHeap: ) return 0 -# ------------------------------------------------------------ +{{endfor}} + +#------------------------------------------------------------ # find_node_split_dim: # this computes the equivalent of # j_max = np.argmax(np.max(data, 0) - np.min(data, 0)) -cdef intp_t find_node_split_dim(float64_t* data, - intp_t* node_indices, - intp_t n_features, - intp_t n_points) except -1: +cdef intp_t find_node_split_dim(const floating* data, + intp_t* node_indices, + intp_t n_features, + intp_t n_points) except -1: """Find the dimension with the largest spread. 
Parameters @@ -764,29 +790,31 @@ def newObj(obj): return obj.__new__(obj) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} + ###################################################################### -# define the reverse mapping of VALID_METRICS +# define the reverse mapping of VALID_METRICS{{name_suffix}} from sklearn.metrics._dist_metrics import get_valid_metric_ids -VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS) +VALID_METRIC_IDS{{name_suffix}} = get_valid_metric_ids(VALID_METRICS{{name_suffix}}) ###################################################################### # Binary Tree class -cdef class BinaryTree: +cdef class BinaryTree{{name_suffix}}: - cdef readonly const float64_t[:, ::1] data - cdef readonly const float64_t[::1] sample_weight + cdef readonly const {{INPUT_DTYPE_t}}[:, ::1] data + cdef readonly const {{INPUT_DTYPE_t}}[::1] sample_weight cdef public float64_t sum_weight cdef public const intp_t[::1] idx_array cdef public const NodeData_t[::1] node_data - cdef public const float64_t[:, :, ::1] node_bounds + cdef public const {{INPUT_DTYPE_t}}[:, :, ::1] node_bounds cdef intp_t leaf_size cdef intp_t n_levels cdef intp_t n_nodes - cdef DistanceMetric64 dist_metric + cdef DistanceMetric{{name_suffix}} dist_metric cdef int euclidean # variables to keep track of building & querying stats @@ -795,7 +823,7 @@ cdef class BinaryTree: cdef int n_splits cdef int n_calls - valid_metrics = VALID_METRIC_IDS + valid_metrics = VALID_METRIC_IDS{{name_suffix}} # Use cinit to initialize all arrays to empty: this will prevent memory # errors and seg-faults in rare cases where __init__ is not called @@ -803,11 +831,11 @@ cdef class BinaryTree: # any problem due to potential access to this attribute # (e.g. assigning to NULL or a to value in another segment). 
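For orientation while reading the templated constructor below: the public tree API is unchanged, i.e. a `BallTree` is still built from the data plus an optional `leaf_size`, `sample_weight`, and metric keyword arguments, then queried. A brief usage reminder:

    import numpy as np
    from sklearn.neighbors import BallTree

    rng = np.random.RandomState(0)
    X = rng.random_sample((100, 3))
    weights = rng.random_sample(100)

    # Build the tree; extra keyword arguments (here p=2) go to the metric.
    tree = BallTree(X, leaf_size=20, metric="minkowski", p=2, sample_weight=weights)
    distances, indices = tree.query(X[:5], k=3)          # k-nearest neighbors
    counts = tree.query_radius(X[:5], r=0.5, count_only=True)  # radius counts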
def __cinit__(self): - self.data = np.empty((1, 1), dtype=np.float64, order='C') - self.sample_weight = np.empty(1, dtype=np.float64, order='C') + self.data = np.empty((1, 1), dtype={{INPUT_DTYPE}}, order='C') + self.sample_weight = np.empty(1, dtype={{INPUT_DTYPE}}, order='C') self.idx_array = np.empty(1, dtype=np.intp, order='C') self.node_data = np.empty(1, dtype=NodeData, order='C') - self.node_bounds = np.empty((1, 1, 1), dtype=np.float64) + self.node_bounds = np.empty((1, 1, 1), dtype={{INPUT_DTYPE}}) self.leaf_size = 0 self.n_levels = 0 @@ -823,7 +851,7 @@ cdef class BinaryTree: def __init__(self, data, leaf_size=40, metric='minkowski', sample_weight=None, **kwargs): # validate data - self.data = check_array(data, dtype=np.float64, order='C') + self.data = check_array(data, dtype={{INPUT_DTYPE}}, order='C') if self.data.size == 0: raise ValueError("X is an empty array") @@ -834,15 +862,15 @@ cdef class BinaryTree: raise ValueError("leaf_size must be greater than or equal to 1") self.leaf_size = leaf_size - self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) + self.dist_metric = DistanceMetric.get_metric(metric, dtype={{INPUT_DTYPE}}, **kwargs) self.euclidean = (self.dist_metric.__class__.__name__ - == 'EuclideanDistance64') + == 'EuclideanDistance{{name_suffix}}') metric = self.dist_metric.__class__.__name__ - if metric not in VALID_METRICS: + if metric not in VALID_METRICS{{name_suffix}}: raise ValueError('metric {metric} is not valid for ' '{BinaryTree}'.format(metric=metric, - **DOC_DICT)) + **DOC_DICT{{name_suffix}})) self.dist_metric._validate_data(self.data) # determine number of levels in the tree, and from this @@ -859,7 +887,7 @@ cdef class BinaryTree: self._update_sample_weight(n_samples, sample_weight) # Allocate tree-specific data - allocate_data(self, self.n_nodes, n_features) + allocate_data{{name_suffix}}(self, self.n_nodes, n_features) self._recursive_build( node_data=self.node_data.base, i_node=0, @@ -870,7 +898,7 @@ cdef class BinaryTree: def _update_sample_weight(self, n_samples, sample_weight): if sample_weight is not None: self.sample_weight = np.asarray( - sample_weight, dtype=np.float64, order='C') + sample_weight, dtype={{INPUT_DTYPE}}, order='C') self.sum_weight = np.sum(self.sample_weight) else: self.sample_weight = None @@ -982,17 +1010,17 @@ cdef class BinaryTree: self.node_bounds.base, ) - cdef inline float64_t dist(self, float64_t* x1, float64_t* x2, - intp_t size) except -1 nogil: + cdef inline float64_t dist(self, {{INPUT_DTYPE_t}}* x1, {{INPUT_DTYPE_t}}* x2, + intp_t size) except -1 nogil: """Compute the distance between arrays x1 and x2""" self.n_calls += 1 if self.euclidean: - return euclidean_dist64(x1, x2, size) + return euclidean_dist{{name_suffix}}(x1, x2, size) else: return self.dist_metric.dist(x1, x2, size) - cdef inline float64_t rdist(self, float64_t* x1, float64_t* x2, - intp_t size) except -1 nogil: + cdef inline float64_t rdist(self, {{INPUT_DTYPE_t}}* x1, {{INPUT_DTYPE_t}}* x2, + intp_t size) except -1 nogil: """Compute the reduced distance between arrays x1 and x2. 
The reduced distance, defined for some metrics, is a quantity which @@ -1002,7 +1030,7 @@ cdef class BinaryTree: """ self.n_calls += 1 if self.euclidean: - return euclidean_rdist64(x1, x2, size) + return euclidean_rdist{{name_suffix}}(x1, x2, size) else: return self.dist_metric.rdist(x1, x2, size) @@ -1023,10 +1051,10 @@ cdef class BinaryTree: cdef intp_t n_points = idx_end - idx_start cdef intp_t n_mid = n_points / 2 cdef intp_t* idx_array = &self.idx_array[idx_start] - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # initialize node data - init_node(self, node_data, i_node, idx_start, idx_end) + init_node{{name_suffix}}(self, node_data, i_node, idx_start, idx_end) if 2 * i_node + 1 >= self.n_nodes: node_data[i_node].is_leaf = True @@ -1103,7 +1131,7 @@ cdef class BinaryTree: corresponding point. """ # XXX: we should allow X to be a pre-built tree. - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " @@ -1115,13 +1143,13 @@ cdef class BinaryTree: # flatten X, and save original shape information np_Xarr = X.reshape((-1, self.data.shape[1])) - cdef const float64_t[:, ::1] Xarr = np_Xarr + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr cdef float64_t reduced_dist_LB cdef intp_t i - cdef float64_t* pt + cdef {{INPUT_DTYPE_t}}* pt # initialize heap for neighbors - cdef NeighborsHeap heap = NeighborsHeap(Xarr.shape[0], k) + cdef NeighborsHeap{{name_suffix}} heap = NeighborsHeap{{name_suffix}}(Xarr.shape[0], k) # node heap for breadth-first queries cdef NodeHeap nodeheap @@ -1141,7 +1169,7 @@ cdef class BinaryTree: if breadth_first: self._query_dual_breadthfirst(other, heap, nodeheap) else: - reduced_dist_LB = min_rdist_dual(self, 0, other, 0) + reduced_dist_LB = min_rdist_dual{{name_suffix}}(self, 0, other, 0) bounds = np.full(other.node_data.shape[0], np.inf) self._query_dual_depthfirst(0, other, 0, bounds, heap, reduced_dist_LB) @@ -1155,7 +1183,7 @@ cdef class BinaryTree: else: with nogil: for i in range(Xarr.shape[0]): - reduced_dist_LB = min_rdist(self, 0, pt) + reduced_dist_LB = min_rdist{{name_suffix}}(self, 0, pt) self._query_single_depthfirst(0, pt, i, heap, reduced_dist_LB) pt += Xarr.shape[1] @@ -1233,20 +1261,20 @@ cdef class BinaryTree: cdef intp_t i, count_i = 0 cdef intp_t n_features = self.data.shape[1] - cdef float64_t[::1] dist_arr_i + cdef {{INPUT_DTYPE_t}}[::1] dist_arr_i cdef intp_t[::1] idx_arr_i, counts - cdef float64_t* pt + cdef {{INPUT_DTYPE_t}}* pt cdef intp_t** indices = NULL - cdef float64_t** distances = NULL + cdef {{INPUT_DTYPE_t}}** distances = NULL # validate X and prepare for query - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " "match training data dimension") - cdef const float64_t[:, ::1] Xarr = X.reshape((-1, self.data.shape[1])) + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = X.reshape((-1, self.data.shape[1])) # prepare r for query r = np.asarray(r, dtype=np.float64, order='C') @@ -1265,7 +1293,7 @@ cdef class BinaryTree: if indices == NULL: raise MemoryError() if return_distance: - distances = calloc(Xarr.shape[0], sizeof(float64_t*)) + distances = <{{INPUT_DTYPE_t}}**>calloc(Xarr.shape[0], sizeof({{INPUT_DTYPE_t}}*)) if distances == NULL: free(indices) raise MemoryError() @@ -1273,7 +1301,7 @@ cdef class BinaryTree: 
np_idx_arr = np.zeros(self.data.shape[0], dtype=np.intp) idx_arr_i = np_idx_arr - np_dist_arr = np.zeros(self.data.shape[0], dtype=np.float64) + np_dist_arr = np.zeros(self.data.shape[0], dtype={{INPUT_DTYPE}}) dist_arr_i = np_dist_arr counts_arr = np.zeros(Xarr.shape[0], dtype=np.intp) @@ -1306,11 +1334,11 @@ cdef class BinaryTree: if return_distance: # equivalent to: distances[i] = np_dist_arr[:counts[i]].copy() - distances[i] = malloc(counts[i] * sizeof(float64_t)) + distances[i] = <{{INPUT_DTYPE_t}}*>malloc(counts[i] * sizeof({{INPUT_DTYPE_t}})) if distances[i] == NULL: memory_error = True break - memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof(float64_t)) + memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof({{INPUT_DTYPE_t}})) try: if memory_error: @@ -1333,7 +1361,7 @@ cdef class BinaryTree: # make a new numpy array that wraps the existing data # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 - distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], cnp.NPY_DOUBLE, distances[i]) + distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], {{NPY_TYPE}}, distances[i]) # make sure the data will be freed when the numpy array is garbage collected PyArray_ENABLEFLAGS(distances_npy[i], cnp.NPY_ARRAY_OWNDATA) # make sure the data is not freed twice @@ -1445,18 +1473,18 @@ cdef class BinaryTree: cdef float64_t log_knorm = _log_kernel_norm(h_c, n_features, kernel_c) # validate X and prepare for query - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != n_features: raise ValueError("query data dimension must " "match training data dimension") Xarr_np = X.reshape((-1, n_features)) - cdef float64_t[:, ::1] Xarr = Xarr_np + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = Xarr_np - log_density_arr = np.zeros(Xarr.shape[0], dtype=np.float64) - cdef float64_t[::1] log_density = log_density_arr + log_density_arr = np.zeros(Xarr.shape[0], dtype={{INPUT_DTYPE}}) + cdef {{INPUT_DTYPE_t}}[::1] log_density = log_density_arr - cdef float64_t* pt = &Xarr[0, 0] + cdef {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] cdef NodeHeap nodeheap if breadth_first: @@ -1481,7 +1509,7 @@ cdef class BinaryTree: pt += n_features else: for i in range(Xarr.shape[0]): - min_max_dist(self, 0, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, 0, pt, &dist_LB, &dist_UB) # compute max & min bounds on density within top node log_min_bound = (log(self.sum_weight) + compute_log_kernel(dist_UB, @@ -1539,14 +1567,14 @@ cdef class BinaryTree: cdef intp_t i # validate X and prepare for query - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " "match training data dimension") np_Xarr = X.reshape((-1, self.data.shape[1])) - cdef float64_t[:, ::1] Xarr = np_Xarr + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr # prepare r for query r = np.asarray(r, dtype=np.float64, order='C') @@ -1561,7 +1589,7 @@ cdef class BinaryTree: count = np.zeros(r.shape[0], dtype=np.intp) cdef intp_t[::1] carr = count - cdef float64_t* pt = &Xarr[0, 0] + cdef {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] if dualtree: other = self.__class__(Xarr, metric=self.dist_metric, @@ -1576,17 +1604,21 @@ cdef class BinaryTree: return count - cdef int _query_single_depthfirst(self, intp_t i_node, - float64_t* pt, intp_t i_pt, - NeighborsHeap heap, - float64_t reduced_dist_LB) except -1 nogil: + cdef int 
_query_single_depthfirst( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1 nogil: """Recursive Single-tree k-neighbors query, depth-first approach""" cdef NodeData_t node_info = self.node_data[i_node] cdef float64_t dist_pt, reduced_dist_LB_1, reduced_dist_LB_2 cdef intp_t i, i1, i2 - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # ------------------------------------------------------------ # Case 1: query point is outside node radius: @@ -1611,8 +1643,8 @@ cdef class BinaryTree: self.n_splits += 1 i1 = 2 * i_node + 1 i2 = i1 + 1 - reduced_dist_LB_1 = min_rdist(self, i1, pt) - reduced_dist_LB_2 = min_rdist(self, i2, pt) + reduced_dist_LB_1 = min_rdist{{name_suffix}}(self, i1, pt) + reduced_dist_LB_2 = min_rdist{{name_suffix}}(self, i2, pt) # recursively query subnodes if reduced_dist_LB_1 <= reduced_dist_LB_2: @@ -1627,19 +1659,22 @@ cdef class BinaryTree: reduced_dist_LB_1) return 0 - cdef int _query_single_breadthfirst(self, float64_t* pt, - intp_t i_pt, - NeighborsHeap heap, - NodeHeap nodeheap) except -1: + cdef int _query_single_breadthfirst( + self, + {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: """Non-recursive single-tree k-neighbors query, breadth-first search""" cdef intp_t i, i_node cdef float64_t dist_pt, reduced_dist_LB cdef NodeData_t* node_data = &self.node_data[0] - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # Set up the node heap and push the head node onto it cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_rdist(self, 0, pt) + nodeheap_item.val = min_rdist{{name_suffix}}(self, 0, pt) nodeheap_item.i1 = 0 nodeheap.push(nodeheap_item) @@ -1672,15 +1707,19 @@ cdef class BinaryTree: self.n_splits += 1 for i in range(2 * i_node + 1, 2 * i_node + 3): nodeheap_item.i1 = i - nodeheap_item.val = min_rdist(self, i, pt) + nodeheap_item.val = min_rdist{{name_suffix}}(self, i, pt) nodeheap.push(nodeheap_item) return 0 - cdef int _query_dual_depthfirst(self, intp_t i_node1, - BinaryTree other, intp_t i_node2, - float64_t[::1] bounds, - NeighborsHeap heap, - float64_t reduced_dist_LB) except -1: + cdef int _query_dual_depthfirst( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t[::1] bounds, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1: """Recursive dual-tree k-neighbors query, depth-first""" # note that the array `bounds` is maintained such that # bounds[i] is the largest distance among any of the @@ -1688,8 +1727,8 @@ cdef class BinaryTree: cdef NodeData_t node_info1 = self.node_data[i_node1] cdef NodeData_t node_info2 = other.node_data[i_node2] - cdef float64_t* data1 = &self.data[0, 0] - cdef float64_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef intp_t n_features = self.data.shape[1] cdef float64_t bound_max, dist_pt, reduced_dist_LB1, reduced_dist_LB2 @@ -1740,9 +1779,9 @@ cdef class BinaryTree: # recursively query, starting with the nearest subnode elif node_info1.is_leaf or (not node_info2.is_leaf and node_info2.radius > node_info1.radius): - reduced_dist_LB1 = min_rdist_dual(self, i_node1, + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, i_node1, other, 2 * i_node2 + 1) - reduced_dist_LB2 = min_rdist_dual(self, i_node1, + reduced_dist_LB2 = 
min_rdist_dual{{name_suffix}}(self, i_node1, other, 2 * i_node2 + 2) if reduced_dist_LB1 < reduced_dist_LB2: @@ -1760,9 +1799,9 @@ cdef class BinaryTree: # Case 3b: node 2 is a leaf or is smaller: split node 1 and # recursively query, starting with the nearest subnode else: - reduced_dist_LB1 = min_rdist_dual(self, 2 * i_node1 + 1, + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 1, other, i_node2) - reduced_dist_LB2 = min_rdist_dual(self, 2 * i_node1 + 2, + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 2, other, i_node2) if reduced_dist_LB1 < reduced_dist_LB2: @@ -1777,9 +1816,12 @@ cdef class BinaryTree: bounds, heap, reduced_dist_LB1) return 0 - cdef int _query_dual_breadthfirst(self, BinaryTree other, - NeighborsHeap heap, - NodeHeap nodeheap) except -1: + cdef int _query_dual_breadthfirst( + self, + BinaryTree{{name_suffix}} other, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: """Non-recursive dual-tree k-neighbors query, breadth-first""" cdef intp_t i, i1, i2, i_node1, i_node2, i_pt cdef float64_t dist_pt, reduced_dist_LB @@ -1787,13 +1829,13 @@ cdef class BinaryTree: cdef NodeData_t* node_data1 = &self.node_data[0] cdef NodeData_t* node_data2 = &other.node_data[0] cdef NodeData_t node_info1, node_info2 - cdef float64_t* data1 = &self.data[0, 0] - cdef float64_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef intp_t n_features = self.data.shape[1] # Set up the node heap and push the head nodes onto it cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_rdist_dual(self, 0, other, 0) + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, 0, other, 0) nodeheap_item.i1 = 0 nodeheap_item.i2 = 0 nodeheap.push(nodeheap_item) @@ -1845,7 +1887,7 @@ cdef class BinaryTree: nodeheap_item.i1 = i_node1 for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): nodeheap_item.i2 = i2 - nodeheap_item.val = min_rdist_dual(self, i_node1, + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i_node1, other, i2) nodeheap.push(nodeheap_item) @@ -1856,21 +1898,24 @@ cdef class BinaryTree: nodeheap_item.i2 = i_node2 for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): nodeheap_item.i1 = i1 - nodeheap_item.val = min_rdist_dual(self, i1, + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i1, other, i_node2) nodeheap.push(nodeheap_item) return 0 - cdef intp_t _query_radius_single(self, - intp_t i_node, - float64_t* pt, float64_t r, - intp_t* indices, - float64_t* distances, - intp_t count, - int count_only, - int return_distance) noexcept nogil: + cdef intp_t _query_radius_single( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t r, + intp_t* indices, + {{INPUT_DTYPE_t}}* distances, + intp_t count, + int count_only, + int return_distance, + ) noexcept nogil: """recursive single-tree radius query, depth-first""" - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef intp_t* idx_array = &self.idx_array[0] cdef intp_t n_features = self.data.shape[1] cdef NodeData_t node_info = self.node_data[i_node] @@ -1879,7 +1924,7 @@ cdef class BinaryTree: cdef float64_t reduced_r cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 - min_max_dist(self, i_node, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) # ------------------------------------------------------------ # Case 1: all node points are outside distance r. 
@@ -1937,13 +1982,17 @@ cdef class BinaryTree: return count - cdef float64_t _kde_single_breadthfirst(self, float64_t* pt, - KernelType kernel, float64_t h, - float64_t log_knorm, - float64_t log_atol, float64_t log_rtol, - NodeHeap nodeheap, - float64_t* node_log_min_bounds, - float64_t* node_log_bound_spreads): + cdef float64_t _kde_single_breadthfirst( + self, {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + NodeHeap nodeheap, + float64_t* node_log_min_bounds, + float64_t* node_log_bound_spreads, + ): """non-recursive single-tree kernel density estimation""" # For the given point, node_log_min_bounds and node_log_bound_spreads # will encode the current bounds on the density between the point @@ -1957,9 +2006,9 @@ cdef class BinaryTree: cdef float64_t global_log_min_bound, global_log_bound_spread cdef float64_t global_log_max_bound - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef bint with_sample_weight = self.sample_weight is not None - cdef float64_t* sample_weight + cdef {{INPUT_DTYPE_t}}* sample_weight if with_sample_weight: sample_weight = &self.sample_weight[0] cdef intp_t* idx_array = &self.idx_array[0] @@ -1981,13 +2030,13 @@ cdef class BinaryTree: # push the top node to the heap cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_dist(self, 0, pt) + nodeheap_item.val = min_dist{{name_suffix}}(self, 0, pt) nodeheap_item.i1 = 0 nodeheap.push(nodeheap_item) - global_log_min_bound = log(N) + compute_log_kernel(max_dist(self, - 0, pt), - h, kernel) + global_log_min_bound = log(N) + compute_log_kernel( + max_dist{{name_suffix}}(self, 0, pt), h, kernel + ) global_log_max_bound = log(N) + compute_log_kernel(nodeheap_item.val, h, kernel) global_log_bound_spread = logsubexp(global_log_max_bound, @@ -2056,8 +2105,8 @@ cdef class BinaryTree: N1 = node_data[i1].idx_end - node_data[i1].idx_start N2 = node_data[i2].idx_end - node_data[i2].idx_start - min_max_dist(self, i1, pt, &dist_LB_1, &dist_UB_1) - min_max_dist(self, i2, pt, &dist_LB_2, &dist_UB_2) + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB_1, &dist_UB_1) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB_2, &dist_UB_2) node_log_min_bounds[i1] = (log(N1) + compute_log_kernel(dist_UB_1, @@ -2102,14 +2151,19 @@ cdef class BinaryTree: global_log_bound_spread - log(2)) cdef int _kde_single_depthfirst( - self, intp_t i_node, float64_t* pt, - KernelType kernel, float64_t h, - float64_t log_knorm, - float64_t log_atol, float64_t log_rtol, - float64_t local_log_min_bound, - float64_t local_log_bound_spread, - float64_t* global_log_min_bound, - float64_t* global_log_bound_spread) except -1: + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + float64_t local_log_min_bound, + float64_t local_log_bound_spread, + float64_t* global_log_min_bound, + float64_t* global_log_bound_spread, + ) except -1: """recursive single-tree kernel density estimate, depth-first""" # For the given point, local_min_bound and local_max_bound give the # minimum and maximum density for the current node, while @@ -2119,10 +2173,10 @@ cdef class BinaryTree: cdef intp_t i, i1, i2, iw, start, end cdef float64_t N1, N2 - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef NodeData_t* node_data = &self.node_data[0] cdef bint with_sample_weight = self.sample_weight is not None - cdef float64_t* 
sample_weight + cdef {{INPUT_DTYPE_t}}* sample_weight cdef float64_t log_weight if with_sample_weight: sample_weight = &self.sample_weight[0] @@ -2194,7 +2248,7 @@ cdef class BinaryTree: N1 = (self.node_data[i1].idx_end - self.node_data[i1].idx_start) N2 = (self.node_data[i2].idx_end - self.node_data[i2].idx_start) - min_max_dist(self, i1, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB, &dist_UB) child1_log_min_bound = log(N1) + compute_log_kernel(dist_UB, h, kernel) child1_log_bound_spread = logsubexp(log(N1) + @@ -2202,7 +2256,7 @@ cdef class BinaryTree: kernel), child1_log_min_bound) - min_max_dist(self, i2, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB, &dist_UB) child2_log_min_bound = log(N2) + compute_log_kernel(dist_UB, h, kernel) child2_log_bound_spread = logsubexp(log(N2) + @@ -2238,11 +2292,17 @@ cdef class BinaryTree: global_log_bound_spread) return 0 - cdef int _two_point_single(self, intp_t i_node, float64_t* pt, float64_t* r, - intp_t* count, intp_t i_min, - intp_t i_max) except -1: + cdef int _two_point_single( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: """recursive single-tree two-point correlation function query""" - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef intp_t* idx_array = &self.idx_array[0] cdef intp_t n_features = self.data.shape[1] cdef NodeData_t node_info = self.node_data[i_node] @@ -2251,7 +2311,7 @@ cdef class BinaryTree: cdef float64_t reduced_r cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 - min_max_dist(self, i_node, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) # ------------------------------------------------------------ # Go through bounds and check for cuts @@ -2287,13 +2347,19 @@ cdef class BinaryTree: count, i_min, i_max) return 0 - cdef int _two_point_dual(self, intp_t i_node1, - BinaryTree other, intp_t i_node2, - float64_t* r, intp_t* count, - intp_t i_min, intp_t i_max) except -1: + cdef int _two_point_dual( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: """recursive dual-tree two-point correlation function query""" - cdef float64_t* data1 = &self.data[0, 0] - cdef float64_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef intp_t* idx_array1 = &self.idx_array[0] cdef intp_t* idx_array2 = &other.idx_array[0] cdef NodeData_t node_info1 = self.node_data[i_node1] @@ -2305,8 +2371,8 @@ cdef class BinaryTree: cdef float64_t reduced_r cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 - dist_LB = min_dist_dual(self, i_node1, other, i_node2) - dist_UB = max_dist_dual(self, i_node1, other, i_node2) + dist_LB = min_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) + dist_UB = max_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) # ------------------------------------------------------------ # Go through bounds and check for cuts @@ -2359,21 +2425,11 @@ cdef class BinaryTree: r, count, i_min, i_max) return 0 +{{endfor}} ###################################################################### # Python functions for benchmarking and testing C implementations -def load_heap(float64_t[:, ::1] X, intp_t k): - """test fully loading the heap""" - assert k <= X.shape[1] - cdef NeighborsHeap heap = 
NeighborsHeap(X.shape[0], k) - cdef intp_t i, j - for i in range(X.shape[0]): - for j in range(X.shape[1]): - heap._push(i, X[i, j], j) - return heap.get_arrays() - - def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): """In-place simultaneous sort the given row of the arrays @@ -2412,10 +2468,12 @@ def nodeheap_sort(float64_t[::1] vals): return np.asarray(vals_sorted), np.asarray(indices) -cdef inline float64_t _total_node_weight(NodeData_t* node_data, - float64_t* sample_weight, - intp_t* idx_array, - intp_t i_node): +cdef inline float64_t _total_node_weight( + NodeData_t* node_data, + const floating* sample_weight, + intp_t* idx_array, + intp_t i_node, +): cdef intp_t i cdef float64_t N = 0.0 for i in range(node_data[i_node].idx_start, node_data[i_node].idx_end): diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index a9b78d6e499c9..0f5bd1439f81c 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -329,8 +329,8 @@ def predict_proba(self, X): self._fit_X, k=self.n_neighbors, weights=self.weights, - labels=self._y, - unique_labels=self.classes_, + Y_labels=self._y, + unique_Y_labels=self.classes_, metric=metric, metric_kwargs=metric_kwargs, # `strategy="parallel_on_X"` has in practice be shown diff --git a/sklearn/neighbors/_kd_tree.pyx b/sklearn/neighbors/_kd_tree.pyx.tp similarity index 65% rename from sklearn/neighbors/_kd_tree.pyx rename to sklearn/neighbors/_kd_tree.pyx.tp index f5cd2617be147..1006ec2a8398c 100644 --- a/sklearn/neighbors/_kd_tree.pyx +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -1,22 +1,52 @@ +{{py: + +# Generated file: _kd_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + # By Jake Vanderplas (2013) # written for the scikit-learn project # License: BSD -__all__ = ['KDTree'] +}} + -DOC_DICT = {'BinaryTree': 'KDTree', 'binary_tree': 'kd_tree'} +__all__ = ['KDTree', 'KDTree64', 'KDTree32'] -VALID_METRICS = ['EuclideanDistance64', 'ManhattanDistance64', - 'ChebyshevDistance64', 'MinkowskiDistance64'] +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'KDTree{{name_suffix}}', + 'binary_tree': 'kd_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'EuclideanDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}' +] + +{{endfor}} include "_binary_tree.pxi" -# Inherit KDTree from BinaryTree -cdef class KDTree(BinaryTree): - __doc__ = CLASS_DOC.format(**DOC_DICT) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit KDTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class KDTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) pass +{{endfor}} + # ---------------------------------------------------------------------- # The functions below specialized the Binary Tree as a KD Tree @@ -28,27 +58,36 @@ cdef class KDTree(BinaryTree): # distance for the Euclidean metric is the squared-euclidean distance. # For some metrics, the reduced distance is simply the distance. 
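For the Euclidean metric, the "reduced distance" referred to in the comment above is simply the squared distance; a minimal NumPy sketch (illustrative values only) of how the two quantities relate:

    import numpy as np

    x = np.array([0.0, 3.0])
    y = np.array([4.0, 0.0])

    rdist = np.sum((x - y) ** 2)  # reduced distance: squared Euclidean, 25.0
    dist = np.sqrt(rdist)         # true distance, 5.0

Because the mapping from rdist to dist is monotone, neighbor ranking and bound pruning can be done on rdist alone, and conversions such as the `_rdist_to_dist` calls in the dual-tree helpers below are only needed where a true distance is actually reported.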
+{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} -cdef int allocate_data(BinaryTree tree, intp_t n_nodes, - intp_t n_features) except -1: +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: """Allocate arrays needed for the KD Tree""" - tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype=np.float64) + tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype={{INPUT_DTYPE}}) return 0 -cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, intp_t i_node, - intp_t idx_start, intp_t idx_end) except -1: +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: """Initialize the node for the dataset stored in tree.data""" cdef intp_t n_features = tree.data.shape[1] cdef intp_t i, j cdef float64_t rad = 0 - cdef float64_t* lower_bounds = &tree.node_bounds[0, i_node, 0] - cdef float64_t* upper_bounds = &tree.node_bounds[1, i_node, 0] - cdef float64_t* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* lower_bounds = &tree.node_bounds[0, i_node, 0] + cdef {{INPUT_DTYPE_t}}* upper_bounds = &tree.node_bounds[1, i_node, 0] + cdef {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] cdef intp_t* idx_array = &tree.idx_array[0] - cdef float64_t* data_row + cdef {{INPUT_DTYPE_t}}* data_row # determine Node bounds for j in range(n_features): @@ -81,8 +120,11 @@ cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, intp_t i_node, return 0 -cdef float64_t min_rdist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1 nogil: +cdef float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: """Compute the minimum reduced-distance between a point and a node""" cdef intp_t n_features = tree.data.shape[1] cdef float64_t d, d_lo, d_hi, rdist=0.0 @@ -105,16 +147,26 @@ cdef float64_t min_rdist(BinaryTree tree, intp_t i_node, return rdist -cdef float64_t min_dist(BinaryTree tree, intp_t i_node, float64_t* pt) except -1: +cdef float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the minimum distance between a point and a node""" if tree.dist_metric.p == INF: - return min_rdist(tree, i_node, pt) + return min_rdist{{name_suffix}}(tree, i_node, pt) else: - return pow(min_rdist(tree, i_node, pt), 1. / tree.dist_metric.p) + return pow( + min_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) -cdef float64_t max_rdist(BinaryTree tree, - intp_t i_node, float64_t* pt) except -1: +cdef float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the maximum reduced-distance between a point and a node""" cdef intp_t n_features = tree.data.shape[1] @@ -134,16 +186,28 @@ cdef float64_t max_rdist(BinaryTree tree, return rdist -cdef float64_t max_dist(BinaryTree tree, intp_t i_node, float64_t* pt) except -1: +cdef float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the maximum distance between a point and a node""" if tree.dist_metric.p == INF: - return max_rdist(tree, i_node, pt) + return max_rdist{{name_suffix}}(tree, i_node, pt) else: - return pow(max_rdist(tree, i_node, pt), 1. 
/ tree.dist_metric.p) - - -cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, - float64_t* min_dist, float64_t* max_dist) except -1 nogil: + return pow( + max_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 nogil: """Compute the minimum and maximum distance between a point and a node""" cdef intp_t n_features = tree.data.shape[1] @@ -177,8 +241,12 @@ cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, return 0 -cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the minimum reduced distance between two nodes""" cdef intp_t n_features = tree1.data.shape[1] @@ -208,15 +276,24 @@ cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, return rdist -cdef inline float64_t min_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the minimum distance between two nodes""" - return tree1.dist_metric._rdist_to_dist(min_rdist_dual(tree1, i_node1, - tree2, i_node2)) + return tree1.dist_metric._rdist_to_dist( + min_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) -cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the maximum reduced distance between two nodes""" cdef intp_t n_features = tree1.data.shape[1] @@ -240,8 +317,20 @@ cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, return rdist -cdef inline float64_t max_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the maximum distance between two nodes""" - return tree1.dist_metric._rdist_to_dist(max_rdist_dual(tree1, i_node1, - tree2, i_node2)) + return tree1.dist_metric._rdist_to_dist( + max_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class KDTree(KDTree64): + __doc__ = CLASS_DOC.format(BinaryTree="KDTree") + pass diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd index 927fde873ee58..c6a0d4bb975c2 100644 --- a/sklearn/neighbors/_partition_nodes.pxd +++ b/sklearn/neighbors/_partition_nodes.pxd @@ -1,7 +1,8 @@ +from cython cimport floating from ..utils._typedefs cimport float64_t, intp_t cdef int partition_node_indices( - float64_t *data, + floating *data, intp_t *node_indices, intp_t split_dim, intp_t split_index, diff --git a/sklearn/neighbors/_partition_nodes.pyx b/sklearn/neighbors/_partition_nodes.pyx index d293b765ea279..011b024fccb14 100644 --- a/sklearn/neighbors/_partition_nodes.pyx +++ b/sklearn/neighbors/_partition_nodes.pyx @@ 
-16,6 +16,8 @@ # - https://en.cppreference.com/w/cpp/algorithm/nth_element. # - https://github.com/scikit-learn/scikit-learn/pull/11103 # - https://github.com/scikit-learn/scikit-learn/pull/19473 +from cython cimport floating + cdef extern from *: """ @@ -63,7 +65,7 @@ cdef extern from *: cdef int partition_node_indices( - float64_t *data, + floating *data, intp_t *node_indices, intp_t split_dim, intp_t split_index, diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index b9b7f4030d02c..2897c1ce409e8 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -15,6 +15,7 @@ import numpy as np from ..base import RegressorMixin, _fit_context +from ..metrics import DistanceMetric from ..utils._param_validation import StrOptions from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights @@ -71,7 +72,7 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : str or callable, default='minkowski' + metric : str, DistanceMetric object or callable, default='minkowski' Metric to use for distance computation. Default is "minkowski", which results in the standard Euclidean distance when p = 2. See the documentation of `scipy.spatial.distance @@ -89,6 +90,9 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): between those vectors. This works for Scipy's metrics, but is less efficient than passing the metric name as a string. + If metric is a DistanceMetric object, it will be passed directly to + the underlying computation routines. + metric_params : dict, default=None Additional keyword arguments for the metric function. 
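A minimal usage sketch of the behaviour documented above, assuming a scikit-learn build that includes this change; the toy data and parameter values are illustrative only:

    import numpy as np
    from sklearn.metrics import DistanceMetric
    from sklearn.neighbors import KNeighborsRegressor

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0.0, 1.0, 2.0, 3.0])

    # Pass a prebuilt DistanceMetric object instead of a metric name.
    manhattan = DistanceMetric.get_metric("manhattan")
    reg = KNeighborsRegressor(n_neighbors=2, metric=manhattan).fit(X, y)
    print(reg.predict([[1.5]]))  # mean of the two nearest targets -> [1.5]

As the added docstring notes, the object is passed directly to the underlying computation routines, so the same pre-parameterised metric can be reused across estimators.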
@@ -164,6 +168,7 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): **NeighborsBase._parameter_constraints, "weights": [StrOptions({"uniform", "distance"}), callable, None], } + _parameter_constraints["metric"].append(DistanceMetric) _parameter_constraints.pop("radius") def __init__( diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index efca4e491ce01..5263f201f320b 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -2,9 +2,9 @@ import numpy as np import pytest -from numpy.testing import assert_array_almost_equal +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal -from sklearn.neighbors._ball_tree import BallTree +from sklearn.neighbors._ball_tree import BallTree, BallTree32, BallTree64 from sklearn.utils import check_random_state from sklearn.utils._testing import _convert_container from sklearn.utils.validation import check_array @@ -15,6 +15,13 @@ DIMENSION = 3 +METRICS = { + "euclidean": {}, + "manhattan": {}, + "minkowski": dict(p=3), + "chebyshev": {}, +} + DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"] BOOLEAN_METRICS = [ @@ -26,6 +33,11 @@ "sokalsneath", ] +BALL_TREE_CLASSES = [ + BallTree64, + BallTree32, +] + def brute_force_neighbors(X, Y, k, metric, **kwargs): from sklearn.metrics import DistanceMetric @@ -37,9 +49,14 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): return dist, ind +def test_BallTree_is_BallTree64_subclass(): + assert issubclass(BallTree, BallTree64) + + @pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) @pytest.mark.parametrize("array_type", ["list", "array"]) -def test_ball_tree_query_metrics(metric, array_type): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_ball_tree_query_metrics(metric, array_type, BallTreeImplementation): rng = check_random_state(0) if metric in BOOLEAN_METRICS: X = rng.random_sample((40, 10)).round(0) @@ -52,31 +69,36 @@ def test_ball_tree_query_metrics(metric, array_type): k = 5 - bt = BallTree(X, leaf_size=1, metric=metric) + bt = BallTreeImplementation(X, leaf_size=1, metric=metric) dist1, ind1 = bt.query(Y, k) dist2, ind2 = brute_force_neighbors(X, Y, k, metric) assert_array_almost_equal(dist1, dist2) -def test_query_haversine(): +@pytest.mark.parametrize( + "BallTreeImplementation, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5]) +) +def test_query_haversine(BallTreeImplementation, decimal_tol): rng = check_random_state(0) X = 2 * np.pi * rng.random_sample((40, 2)) - bt = BallTree(X, leaf_size=1, metric="haversine") + bt = BallTreeImplementation(X, leaf_size=1, metric="haversine") dist1, ind1 = bt.query(X, k=5) dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine") - assert_array_almost_equal(dist1, dist2) + assert_array_almost_equal(dist1, dist2, decimal=decimal_tol) assert_array_almost_equal(ind1, ind2) -def test_array_object_type(): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_array_object_type(BallTreeImplementation): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - BallTree(X) + BallTreeImplementation(X) -def test_bad_pyfunc_metric(): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_bad_pyfunc_metric(BallTreeImplementation): def wrong_returned_value(x, 
y): return "1" @@ -86,8 +108,93 @@ def one_arg_func(x): X = np.ones((5, 2)) msg = "Custom distance function must accept two vectors and return a float." with pytest.raises(TypeError, match=msg): - BallTree(X, metric=wrong_returned_value) + BallTreeImplementation(X, metric=wrong_returned_value) msg = "takes 1 positional argument but 2 were given" with pytest.raises(TypeError, match=msg): - BallTree(X, metric=one_arg_func) + BallTreeImplementation(X, metric=one_arg_func) + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_ball_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 5 + dist_64, ind_64 = bt_64.query(Y_64, k=k) + dist_32, ind_32 = bt_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = bt_64.query_radius(Y_64, r=r) + ind_32 = bt_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = bt_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = bt_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = bt_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 + + +def test_two_point_correlation_numerical_consistency(global_random_seed): + # Test consistency with respect to the `two_point_correlation` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + bt_64 = BallTree64(X_64, leaf_size=10) + bt_32 = BallTree32(X_32, leaf_size=10) + + r = np.linspace(0, 1, 10) + + counts_64 = bt_64.two_point_correlation(Y_64, r=r, dualtree=True) + counts_32 = bt_32.two_point_correlation(Y_32, r=r, dualtree=True) + assert_allclose(counts_64, counts_32) + + +def get_dataset_for_binary_tree(random_seed, features=3): + rng = np.random.RandomState(random_seed) + _X = rng.rand(100, features) + _Y = rng.rand(5, features) + 
+ X_64 = _X.astype(dtype=np.float64, copy=False) + Y_64 = _Y.astype(dtype=np.float64, copy=False) + + X_32 = _X.astype(dtype=np.float32, copy=False) + Y_32 = _Y.astype(dtype=np.float32, copy=False) + + return X_64, X_32, Y_64, Y_32 diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 1aee28cc36bd0..749601baaf66f 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -1,30 +1,100 @@ import numpy as np import pytest +from numpy.testing import assert_allclose, assert_equal -from sklearn.neighbors._kd_tree import KDTree +from sklearn.neighbors._kd_tree import KDTree, KDTree32, KDTree64 +from sklearn.neighbors.tests.test_ball_tree import get_dataset_for_binary_tree from sklearn.utils.parallel import Parallel, delayed DIMENSION = 3 METRICS = {"euclidean": {}, "manhattan": {}, "chebyshev": {}, "minkowski": dict(p=3)} +KD_TREE_CLASSES = [ + KDTree64, + KDTree32, +] -def test_array_object_type(): + +def test_KDTree_is_KDTree64_subclass(): + assert issubclass(KDTree, KDTree64) + + +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_array_object_type(BinarySearchTree): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - KDTree(X) + BinarySearchTree(X) -def test_kdtree_picklable_with_joblib(): +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_kdtree_picklable_with_joblib(BinarySearchTree): """Make sure that KDTree queries work when joblib memmaps. Non-regression test for #21685 and #21228.""" rng = np.random.RandomState(0) X = rng.random_sample((10, 3)) - tree = KDTree(X, leaf_size=2) + tree = BinarySearchTree(X, leaf_size=2) # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that # use to raise "ValueError: buffer source array is read-only" in a previous # version of the Cython code. Parallel(n_jobs=2, max_nbytes=1)(delayed(tree.query)(data) for data in 2 * [X]) + + +@pytest.mark.parametrize("metric", METRICS) +def test_kd_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. 
+ X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 4 + dist_64, ind_64 = kd_64.query(Y_64, k=k) + dist_32, ind_32 = kd_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = kd_64.query_radius(Y_64, r=r) + ind_32 = kd_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = kd_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = kd_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", METRICS) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = kd_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = kd_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 405ac3a6d0847..35fc210bea7f3 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -24,6 +24,9 @@ ) from sklearn.base import clone from sklearn.exceptions import DataConversionWarning, EfficiencyWarning, NotFittedError +from sklearn.metrics._dist_metrics import ( + DistanceMetric, +) from sklearn.metrics.pairwise import pairwise_distances from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS from sklearn.metrics.tests.test_pairwise_distances_reduction import ( @@ -69,6 +72,7 @@ COMMON_VALID_METRICS = sorted( set.intersection(*map(set, neighbors.VALID_METRICS.values())) ) # type: ignore + P = (1, 2, 3, 4, np.inf) JOBLIB_BACKENDS = list(joblib.parallel.BACKENDS.keys()) @@ -76,6 +80,25 @@ neighbors.kneighbors_graph = ignore_warnings(neighbors.kneighbors_graph) neighbors.radius_neighbors_graph = ignore_warnings(neighbors.radius_neighbors_graph) +# A list containing metrics where the string specifies the use of the +# DistanceMetric object directly (as resolved in _parse_metric) +DISTANCE_METRIC_OBJS = ["DM_euclidean"] + + +def _parse_metric(metric: str, dtype=None): + """ + Helper function for properly building a type-specialized DistanceMetric instances. 
+ + Constructs a type-specialized DistanceMetric instance from a string + beginning with "DM_" while allowing a pass-through for other metric-specifying + strings. This is necessary since we wish to parameterize dtype independent of + metric, yet DistanceMetric requires it for construction. + + """ + if metric[:3] == "DM_": + return DistanceMetric.get_metric(metric[3:], dtype=dtype) + return metric + def _generate_test_params_for(metric: str, n_features: int): """Return list of DistanceMetric kwargs for tests.""" @@ -129,7 +152,7 @@ def _weight_func(dist): ], ) @pytest.mark.parametrize("query_is_train", [False, True]) -@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS + DISTANCE_METRIC_OBJS) # type: ignore # noqa def test_unsupervised_kneighbors( global_dtype, n_samples, @@ -143,6 +166,8 @@ def test_unsupervised_kneighbors( # on their common metrics, with and without returning # distances + metric = _parse_metric(metric, global_dtype) + # Redefining the rng locally to use the same generated X local_rng = np.random.RandomState(0) X = local_rng.rand(n_samples, n_features).astype(global_dtype, copy=False) @@ -157,6 +182,12 @@ def test_unsupervised_kneighbors( results = [] for algorithm in ALGORITHMS: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." + ) neigh = neighbors.NearestNeighbors( n_neighbors=n_neighbors, algorithm=algorithm, metric=metric ) @@ -206,7 +237,7 @@ def test_unsupervised_kneighbors( (1000, 5, 100), ], ) -@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS + DISTANCE_METRIC_OBJS) # type: ignore # noqa @pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) @pytest.mark.parametrize( "NeighborsMixinSubclass", @@ -230,6 +261,19 @@ def test_neigh_predictions_algorithm_agnosticity( # The different algorithms must return identical predictions results # on their common metrics. + metric = _parse_metric(metric, global_dtype) + if isinstance(metric, DistanceMetric): + if "Classifier" in NeighborsMixinSubclass.__name__: + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for" + " classifiers." + ) + if "Radius" in NeighborsMixinSubclass.__name__: + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for" + " radius-neighbor estimators." + ) + # Redefining the rng locally to use the same generated X local_rng = np.random.RandomState(0) X = local_rng.rand(n_samples, n_features).astype(global_dtype, copy=False) @@ -244,6 +288,12 @@ def test_neigh_predictions_algorithm_agnosticity( ) for algorithm in ALGORITHMS: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." 
+ ) neigh = NeighborsMixinSubclass(parameter, algorithm=algorithm, metric=metric) neigh.fit(X, y) @@ -985,15 +1035,26 @@ def test_query_equidistant_kth_nn(algorithm): @pytest.mark.parametrize( ["algorithm", "metric"], - [ - ("ball_tree", "euclidean"), - ("kd_tree", "euclidean"), + list( + product( + ("kd_tree", "ball_tree", "brute"), + ("euclidean", *DISTANCE_METRIC_OBJS), + ) + ) + + [ ("brute", "euclidean"), ("brute", "precomputed"), ], ) def test_radius_neighbors_sort_results(algorithm, metric): # Test radius_neighbors[_graph] output when sort_result is True + + metric = _parse_metric(metric, np.float64) + if isinstance(metric, DistanceMetric): + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for radius-neighbor" + " estimators." + ) n_samples = 10 rng = np.random.RandomState(42) X = rng.random_sample((n_samples, 4)) @@ -1560,11 +1621,14 @@ def test_nearest_neighbors_validate_params(): neighbors.VALID_METRICS["brute"] ) - set(["pyfunc", *BOOL_METRICS]) - ), + ) + + DISTANCE_METRIC_OBJS, ) def test_neighbors_metrics( global_dtype, metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5 ): + metric = _parse_metric(metric, global_dtype) + # Test computing the neighbors for various metrics algorithms = ["brute", "ball_tree", "kd_tree"] X_train = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) @@ -1574,12 +1638,21 @@ def test_neighbors_metrics( for metric_params in metric_params_list: # Some metric (e.g. Weighted minkowski) are not supported by KDTree - exclude_kd_tree = metric not in neighbors.VALID_METRICS["kd_tree"] or ( - "minkowski" in metric and "w" in metric_params + exclude_kd_tree = ( + False + if isinstance(metric, DistanceMetric) + else metric not in neighbors.VALID_METRICS["kd_tree"] + or ("minkowski" in metric and "w" in metric_params) ) results = {} p = metric_params.pop("p", 2) for algorithm in algorithms: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." + ) neigh = neighbors.NearestNeighbors( n_neighbors=n_neighbors, algorithm=algorithm, @@ -1684,10 +1757,14 @@ def custom_metric(x1, x2): assert_allclose(dist1, dist2) -@pytest.mark.parametrize("metric", neighbors.VALID_METRICS["brute"]) +@pytest.mark.parametrize( + "metric", neighbors.VALID_METRICS["brute"] + DISTANCE_METRIC_OBJS +) def test_valid_brute_metric_for_auto_algorithm( global_dtype, metric, n_samples=20, n_features=12 ): + metric = _parse_metric(metric, global_dtype) + X = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) Xcsr = csr_matrix(X) @@ -2207,3 +2284,22 @@ def test_predict_dataframe(): knn = neighbors.KNeighborsClassifier(n_neighbors=2).fit(X, y) knn.predict(X) + + +def test_nearest_neighbours_works_with_p_less_than_1(): + """Check that NearestNeighbors works with :math:`p \\in (0,1)` when `algorithm` + is `"auto"` or `"brute"` regardless of the dtype of X. 
+ + Non-regression test for issue #26548 + """ + X = np.array([[1.0, 0.0], [0.0, 0.0], [0.0, 1.0]]) + neigh = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="brute", metric_params={"p": 0.5} + ) + neigh.fit(X) + + y = neigh.radius_neighbors(X[0].reshape(1, -1), radius=4, return_distance=False) + assert_allclose(y[0], [0, 1, 2]) + + y = neigh.kneighbors(X[0].reshape(1, -1), return_distance=False) + assert_allclose(y[0], [0, 1, 2]) diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index 590e72ab785d2..4d8bac12f7423 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -13,7 +13,7 @@ kernel_norm, ) from sklearn.neighbors._ball_tree import ( - NeighborsHeap as NeighborsHeapBT, + NeighborsHeap64 as NeighborsHeapBT, ) from sklearn.neighbors._ball_tree import ( nodeheap_sort as nodeheap_sort_bt, @@ -25,7 +25,7 @@ KDTree, ) from sklearn.neighbors._kd_tree import ( - NeighborsHeap as NeighborsHeapKDT, + NeighborsHeap64 as NeighborsHeapKDT, ) from sklearn.neighbors._kd_tree import ( nodeheap_sort as nodeheap_sort_kdt, diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index d6ad0001ad257..d85196e879b45 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -334,9 +334,7 @@ def _log_message(self, step_idx): def _check_method_params(self, method, props, **kwargs): if _routing_enabled(): - routed_params = process_routing( - self, method=method, other_params=props, **kwargs - ) + routed_params = process_routing(self, method, **props, **kwargs) return routed_params else: fit_params_steps = Bunch( @@ -586,7 +584,7 @@ def predict(self, X, **params): return self.steps[-1][1].predict(Xt, **params) # metadata routing enabled - routed_params = process_routing(self, "predict", other_params=params) + routed_params = process_routing(self, "predict", **params) for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt, **routed_params[name].transform) return self.steps[-1][1].predict(Xt, **routed_params[self.steps[-1][0]].predict) @@ -706,7 +704,7 @@ def predict_proba(self, X, **params): return self.steps[-1][1].predict_proba(Xt, **params) # metadata routing enabled - routed_params = process_routing(self, "predict_proba", other_params=params) + routed_params = process_routing(self, "predict_proba", **params) for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt, **routed_params[name].transform) return self.steps[-1][1].predict_proba( @@ -747,7 +745,7 @@ def decision_function(self, X, **params): # not branching here since params is only available if # enable_metadata_routing=True - routed_params = process_routing(self, "decision_function", other_params=params) + routed_params = process_routing(self, "decision_function", **params) Xt = X for _, name, transform in self._iter(with_final=False): @@ -833,7 +831,7 @@ def predict_log_proba(self, X, **params): return self.steps[-1][1].predict_log_proba(Xt, **params) # metadata routing enabled - routed_params = process_routing(self, "predict_log_proba", other_params=params) + routed_params = process_routing(self, "predict_log_proba", **params) for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt, **routed_params[name].transform) return self.steps[-1][1].predict_log_proba( @@ -882,7 +880,7 @@ def transform(self, X, **params): # not branching here since params is only available if # enable_metadata_routing=True - routed_params = process_routing(self, 
"transform", other_params=params) + routed_params = process_routing(self, "transform", **params) Xt = X for _, name, transform in self._iter(): Xt = transform.transform(Xt, **routed_params[name].transform) @@ -925,7 +923,7 @@ def inverse_transform(self, Xt, **params): # we don't have to branch here, since params is only non-empty if # enable_metadata_routing=True. - routed_params = process_routing(self, "inverse_transform", other_params=params) + routed_params = process_routing(self, "inverse_transform", **params) reverse_iter = reversed(list(self._iter())) for _, name, transform in reverse_iter: Xt = transform.inverse_transform( @@ -981,7 +979,7 @@ def score(self, X, y=None, sample_weight=None, **params): # metadata routing is enabled. routed_params = process_routing( - self, "score", sample_weight=sample_weight, other_params=params + self, "score", sample_weight=sample_weight, **params ) Xt = X @@ -1108,7 +1106,7 @@ def get_metadata_routing(self): router = MetadataRouter(owner=self.__class__.__name__) # first we add all steps except the last one - for _, name, trans in self._iter(with_final=False): + for _, name, trans in self._iter(with_final=False, filter_passthrough=True): method_mapping = MethodMapping() # fit, fit_predict, and fit_transform call fit_transform if it # exists, or else fit and transform @@ -1142,7 +1140,7 @@ def get_metadata_routing(self): router.add(method_mapping=method_mapping, **{name: trans}) final_name, final_est = self.steps[-1] - if not final_est: + if final_est is None or final_est == "passthrough": return router # then we add the last step diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 0a0447de95cd8..2c4ea4af450f2 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -14,6 +14,7 @@ from ..utils._encode import _check_unknown, _encode, _get_counts, _unique from ..utils._mask import _get_mask from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from ..utils._set_output import _get_output_config from ..utils.validation import _check_feature_names_in, check_is_fitted __all__ = ["OneHotEncoder", "OrdinalEncoder"] @@ -176,11 +177,11 @@ def _transform( warn_on_unknown=False, ignore_category_indices=None, ): - self._check_feature_names(X, reset=False) - self._check_n_features(X, reset=False) X_list, n_samples, n_features = self._check_X( X, force_all_finite=force_all_finite ) + self._check_feature_names(X, reset=False) + self._check_n_features(X, reset=False) X_int = np.zeros((n_samples, n_features), dtype=int) X_mask = np.ones((n_samples, n_features), dtype=bool) @@ -437,7 +438,7 @@ def _map_infrequent_categories(self, X_int, X_mask, ignore_category_indices): X_int[rows_to_update, i] = np.take(mapping, X_int[rows_to_update, i]) def _more_tags(self): - return {"X_types": ["categorical"]} + return {"X_types": ["2darray", "categorical"], "allow_nan": True} class OneHotEncoder(_BaseEncoder): @@ -1008,6 +1009,14 @@ def transform(self, X): returned. """ check_is_fitted(self) + transform_output = _get_output_config("transform", estimator=self)["dense"] + if transform_output == "pandas" and self.sparse_output: + raise ValueError( + "Pandas output does not support sparse data. Set sparse_output=False to" + " output pandas DataFrames or disable pandas output via" + ' `ohe.set_output(transform="default").' 
+ ) + # validation of X happens in _check_X called by _transform warn_on_unknown = self.drop is not None and self.handle_unknown in { "ignore", @@ -1499,15 +1508,11 @@ def fit(self, X, y=None): if infrequent is not None: cardinalities[feature_idx] -= len(infrequent) - # stores the missing indices per category - self._missing_indices = {} + # missing values are not considered part of the cardinality + # when considering unknown categories or encoded_missing_value for cat_idx, categories_for_idx in enumerate(self.categories_): - for i, cat in enumerate(categories_for_idx): + for cat in categories_for_idx: if is_scalar_nan(cat): - self._missing_indices[cat_idx] = i - - # missing values are not considered part of the cardinality - # when considering unknown categories or encoded_missing_value cardinalities[cat_idx] -= 1 continue diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index d621c6c410153..3008710d3c3dc 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -31,7 +31,7 @@ ] -class LabelEncoder(TransformerMixin, BaseEstimator): +class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): """Encode target labels with value between 0 and n_classes-1. This transformer should be used to encode target values, *i.e.* `y`, and @@ -56,8 +56,8 @@ class LabelEncoder(TransformerMixin, BaseEstimator): -------- `LabelEncoder` can be used to normalize labels. - >>> from sklearn import preprocessing - >>> le = preprocessing.LabelEncoder() + >>> from sklearn.preprocessing import LabelEncoder + >>> le = LabelEncoder() >>> le.fit([1, 2, 2, 6]) LabelEncoder() >>> le.classes_ @@ -70,7 +70,7 @@ class LabelEncoder(TransformerMixin, BaseEstimator): It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels. - >>> le = preprocessing.LabelEncoder() + >>> le = LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) LabelEncoder() >>> list(le.classes_) @@ -165,7 +165,7 @@ def _more_tags(self): return {"X_types": ["1dlabels"]} -class LabelBinarizer(TransformerMixin, BaseEstimator): +class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): """Binarize labels in a one-vs-all fashion. Several regression and binary classification algorithms are @@ -221,8 +221,8 @@ class LabelBinarizer(TransformerMixin, BaseEstimator): Examples -------- - >>> from sklearn import preprocessing - >>> lb = preprocessing.LabelBinarizer() + >>> from sklearn.preprocessing import LabelBinarizer + >>> lb = LabelBinarizer() >>> lb.fit([1, 2, 6, 4, 2]) LabelBinarizer() >>> lb.classes_ @@ -233,7 +233,7 @@ class LabelBinarizer(TransformerMixin, BaseEstimator): Binary targets transform to a column vector - >>> lb = preprocessing.LabelBinarizer() + >>> lb = LabelBinarizer() >>> lb.fit_transform(['yes', 'no', 'no', 'yes']) array([[1], [0], @@ -685,7 +685,7 @@ def _inverse_binarize_thresholding(y, output_type, classes, threshold): raise ValueError("{0} format is not supported".format(output_type)) -class MultiLabelBinarizer(TransformerMixin, BaseEstimator): +class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): """Transform between iterable of iterables and a multilabel format. 
Although a list of sets or tuples is a very intuitive format for multilabel diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py index ea2f3b202bac4..81afeb6a8bd43 100644 --- a/sklearn/preprocessing/_target_encoder.py +++ b/sklearn/preprocessing/_target_encoder.py @@ -27,8 +27,8 @@ class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder): .. note:: `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a - cross fitting scheme is used in `fit_transform` for encoding. See the - :ref:`User Guide ` for details. + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide ` for details. .. versionadded:: 1.3 @@ -68,7 +68,7 @@ class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder): If `"auto"`, then `smooth` is set to an empirical Bayes estimate. cv : int, default=5 - Determines the number of folds in the cross fitting strategy used in + Determines the number of folds in the :term:`cross fitting` strategy used in :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used and for continuous targets, `KFold` is used. @@ -204,8 +204,8 @@ def fit_transform(self, X, y): .. note:: `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a - cross fitting scheme is used in `fit_transform` for encoding. See the - :ref:`User Guide `. for details. + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. Parameters ---------- @@ -260,8 +260,8 @@ def transform(self, X): .. note:: `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a - cross fitting scheme is used in `fit_transform` for encoding. See the - :ref:`User Guide `. for details. + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. Parameters ---------- @@ -273,14 +273,14 @@ def transform(self, X): X_trans : ndarray of shape (n_samples, n_features) Transformed input. """ - X_ordinal, X_valid = self._transform( + X_ordinal, X_known_mask = self._transform( X, handle_unknown="ignore", force_all_finite="allow-nan" ) X_out = np.empty_like(X_ordinal, dtype=np.float64) self._transform_X_ordinal( X_out, X_ordinal, - ~X_valid, + ~X_known_mask, slice(None), self.encodings_, self.target_mean_, @@ -299,8 +299,9 @@ def _fit_encodings_all(self, X, y): inferred_type_of_target = type_of_target(y, input_name="y") if inferred_type_of_target not in accepted_target_types: raise ValueError( - f"Target type was inferred to be {inferred_type_of_target!r}. Only" - f" {accepted_target_types} are supported." + "Unknown label type: Target type was inferred to be " + f"{inferred_type_of_target!r}. Only {accepted_target_types} are " + "supported." ) self.target_type_ = inferred_type_of_target else: @@ -343,4 +344,13 @@ def _transform_X_ordinal( X_out[X_unknown_mask[:, f_idx], f_idx] = y_mean def _more_tags(self): - return {"requires_y": True} + return { + "requires_y": True, + # TargetEncoder is a special case where a transformer uses `y` but + # only accept binary classification and regression targets. For the + # purpose of common tests we use `binary_only` tag to eliminate the + # multiclass tests. TODO: remove this special case when multiclass + # support is added to TargetEncoder. 
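# A hedged illustration of the cross-fitting note repeated in the TargetEncoder
# docstrings above (synthetic data; only the fact that the two paths differ matters):
import numpy as np

from sklearn.preprocessing import TargetEncoder

rng = np.random.RandomState(0)
X = rng.choice(["a", "b", "c"], size=(60, 1)).astype(object)
y = rng.randint(0, 2, size=60)

enc = TargetEncoder(random_state=0)
X_cross = enc.fit_transform(X, y)     # per-fold (cross-fitted) encodings
X_plain = enc.fit(X, y).transform(X)  # encodings learned on the full data

# The two generally differ because fit_transform uses cross fitting internally.
print(np.allclose(X_cross, X_plain))  # typically False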
xref: + # https://github.com/scikit-learn/scikit-learn/pull/26674 + "binary_only": True, + } diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index ca809dd513cf3..9ba041c90f5de 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1588,6 +1588,26 @@ def test_ohe_drop_first_explicit_categories(handle_unknown): assert_allclose(X_trans, X_expected) +def test_ohe_more_informative_error_message(): + """Raise informative error message when pandas output and sparse_output=True.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": [1, 2, 3], "b": ["z", "b", "b"]}, columns=["a", "b"]) + + ohe = OneHotEncoder(sparse_output=True) + ohe.set_output(transform="pandas") + + msg = ( + "Pandas output does not support sparse data. Set " + "sparse_output=False to output pandas DataFrames or disable pandas output" + ) + with pytest.raises(ValueError, match=msg): + ohe.fit_transform(df) + + ohe.fit(df) + with pytest.raises(ValueError, match=msg): + ohe.transform(df) + + def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype(): """Test ordinal encoder with nan passthrough fails when dtype=np.int32.""" diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index fa19171503a1d..7c4bb01535dca 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -6,7 +6,6 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer -from sklearn.utils import _safe_indexing from sklearn.utils._testing import ( _convert_container, assert_allclose_dense_sparse, @@ -196,9 +195,7 @@ def test_function_transformer_raise_error_with_mixed_dtype(X_type): data = _convert_container(data, X_type, columns_name=["value"], dtype=dtype) def func(X): - return np.array( - [mapping[_safe_indexing(X, i)] for i in range(X.size)], dtype=object - ) + return np.array([mapping[X[i]] for i in range(X.size)], dtype=object) def inverse_func(X): return _convert_container( diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 7d413063968e4..633a386c75951 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -672,3 +672,17 @@ def test_nan_label_encoder(): y_trans = le.transform([np.nan]) assert_array_equal(y_trans, [2]) + + +@pytest.mark.parametrize( + "encoder", [LabelEncoder(), LabelBinarizer(), MultiLabelBinarizer()] +) +def test_label_encoders_do_not_have_set_output(encoder): + """Check that label encoders do not define set_output and work with y as a kwarg. + + Non-regression test for #26854. 
+ """ + assert not hasattr(encoder, "set_output") + y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"]) + y_encoded_positional = encoder.fit_transform(["a", "b", "c"]) + assert_array_equal(y_encoded_with_kwarg, y_encoded_positional) diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 2fc5e04b5df83..eb126ec77e526 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -825,7 +825,7 @@ def predict(self, X): def _check_proba(self): if not self.probability: raise AttributeError( - "predict_proba is not available when probability=False" + "predict_proba is not available when probability=False" ) if self._impl not in ("c_svc", "nu_svc"): raise AttributeError("predict_proba only implemented for SVC and NuSVC") @@ -835,7 +835,7 @@ def _check_proba(self): def predict_proba(self, X): """Compute probabilities of possible outcomes for samples in X. - The model need to have probability information computed at training + The model needs to have probability information computed at training time: fit with attribute `probability` set to True. Parameters @@ -1095,18 +1095,26 @@ def _fit_liblinear( Target vector relative to X C : float - Inverse of cross-validation parameter. Lower the C, the more + Inverse of cross-validation parameter. The lower the C, the higher the penalization. fit_intercept : bool - Whether or not to fit the intercept, that is to add a intercept - term to the decision function. + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: ``[x_1, ..., x_n, 1]``, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). intercept_scaling : float - LibLinear internally penalizes the intercept and this term is subject - to regularization just like the other terms of the feature vector. - In order to avoid this, one should increase the intercept_scaling. - such that the feature vector becomes [x, intercept_scaling]. + Liblinear internally penalizes the intercept, treating it like any + other term in the feature vector. To reduce the impact of the + regularization on the intercept, the `intercept_scaling` parameter can + be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 7a54c02201ccb..dfa48b4937147 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -49,6 +49,10 @@ class LinearSVC(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): penalties and loss functions and should scale better to large numbers of samples. + The main differences between :class:`~sklearn.svm.LinearSVC` and + :class:`~sklearn.svm.SVC` lie in the loss function used by default, and in + the handling of intercept regularization between those two implementations. + This class supports both dense and sparse input and the multiclass support is handled according to a one-vs-the-rest scheme. 
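# To make the LinearSVC/SVC difference noted above concrete, a small hedged
# comparison on synthetic data (the two fits are expected to be close, not identical):
from sklearn.datasets import make_classification
from sklearn.svm import SVC, LinearSVC

X, y = make_classification(n_samples=200, n_features=5, random_state=0)

# Squared hinge loss, one-vs-rest, liblinear with a penalized intercept ...
linear_svc = LinearSVC(C=1.0, random_state=0).fit(X, y)
# ... versus hinge loss solved by libsvm's SMO with a linear kernel.
svc = SVC(kernel="linear", C=1.0).fit(X, y)

print(linear_svc.coef_)
print(svc.coef_)  # similar direction, but not expected to match exactly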
@@ -99,20 +103,26 @@ class LinearSVC(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): will be ignored. fit_intercept : bool, default=True - Whether to calculate the intercept for this model. If set - to false, no intercept will be used in calculations - (i.e. data is expected to be already centered). + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). intercept_scaling : float, default=1.0 - When self.fit_intercept is True, instance vector x becomes - ``[x, self.intercept_scaling]``, - i.e. a "synthetic" feature with constant value equals to - intercept_scaling is appended to the instance vector. - The intercept becomes intercept_scaling * synthetic feature weight - Note! the synthetic feature weight is subject to l1/l2 regularization - as all other features. - To lessen the effect of regularization on synthetic feature weight - (and therefore on the intercept) intercept_scaling has to be increased. + When `fit_intercept` is True, the instance vector x becomes ``[x_1, + ..., x_n, intercept_scaling]``, i.e. a "synthetic" feature with a + constant value equal to `intercept_scaling` is appended to the instance + vector. The intercept becomes intercept_scaling * synthetic feature + weight. Note that liblinear internally penalizes the intercept, + treating it like any other term in the feature vector. To reduce the + impact of the regularization on the intercept, the `intercept_scaling` + parameter can be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. class_weight : dict or 'balanced', default=None Set the parameter C of class i to ``class_weight[i]*C`` for @@ -362,6 +372,10 @@ class LinearSVR(RegressorMixin, LinearModel): penalties and loss functions and should scale better to large numbers of samples. + The main differences between :class:`~sklearn.svm.LinearSVR` and + :class:`~sklearn.svm.SVR` lie in the loss function used by default, and in + the handling of intercept regularization between those two implementations. + This class supports both dense and sparse input. Read more in the :ref:`User Guide `. @@ -389,20 +403,26 @@ class LinearSVR(RegressorMixin, LinearModel): loss ('squared_epsilon_insensitive') is the L2 loss. fit_intercept : bool, default=True - Whether to calculate the intercept for this model. If set - to false, no intercept will be used in calculations - (i.e. data is expected to be already centered). + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). intercept_scaling : float, default=1.0 - When self.fit_intercept is True, instance vector x becomes - [x, self.intercept_scaling], - i.e. a "synthetic" feature with constant value equals to - intercept_scaling is appended to the instance vector. 
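# A rough numerical sketch of the intercept handling described in these docstrings
# (toy data; the absolute values are not meaningful, only the tendency that a larger
# intercept_scaling reduces how strongly the intercept is regularized):
import numpy as np

from sklearn.svm import LinearSVC

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])

for scaling in (1.0, 100.0):
    clf = LinearSVC(C=0.1, intercept_scaling=scaling, random_state=0).fit(X, y)
    # liblinear fits weights for the extended vector [x_1, ..., x_n, intercept_scaling]
    # and reports intercept_ = intercept_scaling * w_synthetic.
    print(scaling, clf.coef_.ravel(), clf.intercept_)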
- The intercept becomes intercept_scaling * synthetic feature weight - Note! the synthetic feature weight is subject to l1/l2 regularization - as all other features. - To lessen the effect of regularization on synthetic feature weight - (and therefore on the intercept) intercept_scaling has to be increased. + When `fit_intercept` is True, the instance vector x becomes `[x_1, ..., + x_n, intercept_scaling]`, i.e. a "synthetic" feature with a constant + value equal to `intercept_scaling` is appended to the instance vector. + The intercept becomes intercept_scaling * synthetic feature weight. + Note that liblinear internally penalizes the intercept, treating it + like any other term in the feature vector. To reduce the impact of the + regularization on the intercept, the `intercept_scaling` parameter can + be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. dual : "auto" or bool, default=True Select the algorithm to either solve the dual or primal @@ -462,8 +482,8 @@ class LinearSVR(RegressorMixin, LinearModel): same library as this class (liblinear). SVR : Implementation of Support Vector Machine regression using libsvm: - the kernel can be non-linear but its SMO algorithm does not - scale to large number of samples as LinearSVC does. + the kernel can be non-linear but its SMO algorithm does not scale to + large number of samples as :class:`~sklearn.svm.LinearSVR` does. sklearn.linear_model.SGDRegressor : SGDRegressor can optimize the same cost function as LinearSVR @@ -774,7 +794,7 @@ class SVC(BaseSVC): Indices of support vectors. support_vectors_ : ndarray of shape (n_SV, n_features) - Support vectors. + Support vectors. An empty array if kernel is precomputed. n_support_ : ndarray of shape (n_classes,), dtype=int32 Number of support vectors for each class. diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py new file mode 100644 index 0000000000000..59166e6687369 --- /dev/null +++ b/sklearn/tests/metadata_routing_common.py @@ -0,0 +1,407 @@ +from functools import partial + +import numpy as np + +from sklearn.base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + RegressorMixin, + TransformerMixin, + clone, +) +from sklearn.metrics._scorer import _PredictScorer, mean_squared_error +from sklearn.model_selection import BaseCrossValidator +from sklearn.model_selection._split import GroupsConsumerMixin +from sklearn.utils._metadata_requests import ( + SIMPLE_METHODS, +) +from sklearn.utils.metadata_routing import ( + MetadataRouter, + process_routing, +) + + +def record_metadata(obj, method, record_default=True, **kwargs): + """Utility function to store passed metadata to a method. + + If record_default is False, kwargs whose values are "default" are skipped. + This is so that checks on keyword arguments whose default was not changed + are skipped. 
+ + """ + if not hasattr(obj, "_records"): + obj._records = {} + if not record_default: + kwargs = { + key: val + for key, val in kwargs.items() + if not isinstance(val, str) or (val != "default") + } + obj._records[method] = kwargs + + +def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): + """Check whether the expected metadata is passed to the object's method. + + Parameters + ---------- + split_params : tuple, default=empty + specifies any parameters which are to be checked as being a subset + of the original values. + + """ + records = getattr(obj, "_records", dict()).get(method, dict()) + assert set(kwargs.keys()) == set(records.keys()) + for key, value in kwargs.items(): + recorded_value = records[key] + # The following condition is used to check for any specified parameters + # being a subset of the original values + if key in split_params and recorded_value is not None: + assert np.isin(recorded_value, value).all() + else: + assert recorded_value is value + + +record_metadata_not_default = partial(record_metadata, record_default=False) + + +def assert_request_is_empty(metadata_request, exclude=None): + """Check if a metadata request dict is empty. + + One can exclude a method or a list of methods from the check using the + ``exclude`` parameter. + """ + if isinstance(metadata_request, MetadataRouter): + for _, route_mapping in metadata_request: + assert_request_is_empty(route_mapping.router) + return + + exclude = [] if exclude is None else exclude + for method in SIMPLE_METHODS: + if method in exclude: + continue + mmr = getattr(metadata_request, method) + props = [ + prop + for prop, alias in mmr.requests.items() + if isinstance(alias, str) or alias is not None + ] + assert not len(props) + + +def assert_request_equal(request, dictionary): + for method, requests in dictionary.items(): + mmr = getattr(request, method) + assert mmr.requests == requests + + empty_methods = [method for method in SIMPLE_METHODS if method not in dictionary] + for method in empty_methods: + assert not len(getattr(request, method).requests) + + +class _Registry(list): + # This list is used to get a reference to the sub-estimators, which are not + # necessarily stored on the metaestimator. We need to override __deepcopy__ + # because the sub-estimators are probably cloned, which would result in a + # new copy of the list, but we need copy and deep copy both to return the + # same instance. + def __deepcopy__(self, memo): + return self + + def __copy__(self): + return self + + +class ConsumingRegressor(RegressorMixin, BaseEstimator): + """A regressor consuming metadata. + + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. 
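# The _Registry defined above intentionally survives copying and cloning so that
# tests keep a handle on sub-estimators; a quick sketch of that behaviour
# (assuming the module path introduced by this patch):
import copy

from sklearn.tests.metadata_routing_common import _Registry

registry = _Registry()
# clone() deep-copies estimator parameters, but the registry must keep pointing
# at the same list object so recorded sub-estimators remain reachable.
assert copy.copy(registry) is registry
assert copy.deepcopy(registry) is registry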
+ + """ + + def __init__(self, registry=None): + self.registry = registry + + def partial_fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "partial_fit", sample_weight=sample_weight, metadata=metadata + ) + return self + + def fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "fit", sample_weight=sample_weight, metadata=metadata + ) + return self + + def predict(self, X, sample_weight="default", metadata="default"): + pass # pragma: no cover + + # when needed, uncomment the implementation + # if self.registry is not None: + # self.registry.append(self) + + # record_metadata_not_default( + # self, "predict", sample_weight=sample_weight, metadata=metadata + # ) + # return np.zeros(shape=(len(X),)) + + +class NonConsumingClassifier(ClassifierMixin, BaseEstimator): + """A classifier which accepts no metadata on any method.""" + + def __init__(self, registry=None): + self.registry = registry + + def fit(self, X, y): + if self.registry is not None: + self.registry.append(self) + + self.classes_ = [0, 1] + return self + + def predict(self, X): + return np.ones(len(X)) # pragma: no cover + + +class ConsumingClassifier(ClassifierMixin, BaseEstimator): + """A classifier consuming metadata. + + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + + """ + + def __init__(self, registry=None): + self.registry = registry + + def partial_fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "partial_fit", sample_weight=sample_weight, metadata=metadata + ) + self.classes_ = [0, 1] + return self + + def fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "fit", sample_weight=sample_weight, metadata=metadata + ) + self.classes_ = [0, 1] + return self + + def predict(self, X, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "predict", sample_weight=sample_weight, metadata=metadata + ) + return np.zeros(shape=(len(X),)) + + def predict_proba(self, X, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "predict_proba", sample_weight=sample_weight, metadata=metadata + ) + return np.asarray([[0.0, 1.0]] * len(X)) + + def predict_log_proba(self, X, sample_weight="default", metadata="default"): + pass # pragma: no cover + + # when needed, uncomment the implementation + # if self.registry is not None: + # self.registry.append(self) + + # record_metadata_not_default( + # self, "predict_log_proba", sample_weight=sample_weight, metadata=metadata + # ) + # return np.zeros(shape=(len(X), 2)) + + +class ConsumingTransformer(TransformerMixin, BaseEstimator): + """A transformer which accepts metadata on fit and transform. 
+ + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + """ + + def __init__(self, registry=None): + self.registry = registry + + def fit(self, X, y=None, sample_weight=None, metadata=None): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "fit", sample_weight=sample_weight, metadata=metadata + ) + return self + + def transform(self, X, sample_weight=None): + record_metadata(self, "transform", sample_weight=sample_weight) + return X + + +class ConsumingScorer(_PredictScorer): + def __init__(self, registry=None): + super().__init__(score_func=mean_squared_error, sign=1, kwargs={}) + self.registry = registry + + def _score(self, method_caller, clf, X, y, **kwargs): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default(self, "score", **kwargs) + + sample_weight = kwargs.get("sample_weight", None) + return super()._score(method_caller, clf, X, y, sample_weight=sample_weight) + + +class ConsumingSplitter(BaseCrossValidator, GroupsConsumerMixin): + def __init__(self, registry=None): + self.registry = registry + + def split(self, X, y=None, groups="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default(self, "split", groups=groups, metadata=metadata) + + split_index = len(X) // 2 + train_indices = list(range(0, split_index)) + test_indices = list(range(split_index, len(X))) + yield test_indices, train_indices + yield train_indices, test_indices + + def get_n_splits(self, X=None, y=None, groups=None): + pass # pragma: no cover + + def _iter_test_indices(self, X=None, y=None, groups=None): + split_index = len(X) // 2 + train_indices = list(range(0, split_index)) + test_indices = list(range(split_index, len(X))) + yield test_indices + yield train_indices + + +class MetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): + """A meta-regressor which is only a router.""" + + def __init__(self, estimator): + self.estimator = estimator + + def fit(self, X, y, **fit_params): + params = process_routing(self, "fit", **fit_params) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + + def get_metadata_routing(self): + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, method_mapping="one-to-one" + ) + return router + + +class WeightedMetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): + """A meta-regressor which is also a consumer.""" + + def __init__(self, estimator, registry=None): + self.estimator = estimator + self.registry = registry + + def fit(self, X, y, sample_weight=None, **fit_params): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, "fit", sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **fit_params) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + return self + + def predict(self, X, **predict_params): + params = process_routing(self, "predict", **predict_params) + return self.estimator_.predict(X, **params.estimator.predict) + + def get_metadata_routing(self): + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add(estimator=self.estimator, method_mapping="one-to-one") 
+ ) + return router + + +class WeightedMetaClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): + """A meta-estimator which also consumes sample_weight itself in ``fit``.""" + + def __init__(self, estimator, registry=None): + self.estimator = estimator + self.registry = registry + + def fit(self, X, y, sample_weight=None, **kwargs): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, "fit", sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **kwargs) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + return self + + def get_metadata_routing(self): + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add(estimator=self.estimator, method_mapping="fit") + ) + return router + + +class MetaTransformer(MetaEstimatorMixin, TransformerMixin, BaseEstimator): + """A simple meta-transformer.""" + + def __init__(self, transformer): + self.transformer = transformer + + def fit(self, X, y=None, **fit_params): + params = process_routing(self, "fit", **fit_params) + self.transformer_ = clone(self.transformer).fit(X, y, **params.transformer.fit) + return self + + def transform(self, X, y=None, **transform_params): + params = process_routing(self, "transform", **transform_params) + return self.transformer_.transform(X, **params.transformer.transform) + + def get_metadata_routing(self): + return MetadataRouter(owner=self.__class__.__name__).add( + transformer=self.transformer, method_mapping="one-to-one" + ) diff --git a/sklearn/tests/test_metadata_routing.py b/sklearn/tests/test_metadata_routing.py index 3fc6a9c337f47..50b6f912667ba 100644 --- a/sklearn/tests/test_metadata_routing.py +++ b/sklearn/tests/test_metadata_routing.py @@ -13,13 +13,23 @@ from sklearn import config_context from sklearn.base import ( BaseEstimator, - ClassifierMixin, - MetaEstimatorMixin, - RegressorMixin, - TransformerMixin, clone, ) from sklearn.linear_model import LinearRegression +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingRegressor, + ConsumingTransformer, + MetaRegressor, + MetaTransformer, + NonConsumingClassifier, + WeightedMetaClassifier, + WeightedMetaRegressor, + _Registry, + assert_request_equal, + assert_request_is_empty, + check_recorded_metadata, +) from sklearn.utils import metadata_routing from sklearn.utils._metadata_requests import ( COMPOSITE_METHODS, @@ -56,209 +66,6 @@ def enable_slep006(): yield -def assert_request_is_empty(metadata_request, exclude=None): - """Check if a metadata request dict is empty. - - One can exclude a method or a list of methods from the check using the - ``exclude`` parameter. 
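# A hedged end-to-end sketch of how the helpers above are meant to be combined,
# mirroring the routing tests in this patch (requires enable_metadata_routing=True):
import numpy as np

from sklearn import config_context
from sklearn.tests.metadata_routing_common import (
    ConsumingRegressor,
    WeightedMetaRegressor,
    check_recorded_metadata,
)

X = np.arange(8, dtype=float).reshape(4, 2)
y = np.array([0.0, 1.0, 2.0, 3.0])
w = np.array([1.0, 2.0, 1.0, 2.0])

with config_context(enable_metadata_routing=True):
    meta = WeightedMetaRegressor(
        estimator=ConsumingRegressor().set_fit_request(sample_weight=True)
    )
    # The meta-estimator consumes sample_weight itself and, because the inner
    # estimator requested it, also routes it down to ConsumingRegressor.fit.
    meta.fit(X, y, sample_weight=w)
    check_recorded_metadata(meta.estimator_, "fit", sample_weight=w)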
- """ - if isinstance(metadata_request, MetadataRouter): - for _, route_mapping in metadata_request: - assert_request_is_empty(route_mapping.router) - return - - exclude = [] if exclude is None else exclude - for method in SIMPLE_METHODS: - if method in exclude: - continue - mmr = getattr(metadata_request, method) - props = [ - prop - for prop, alias in mmr.requests.items() - if isinstance(alias, str) or alias is not None - ] - assert not len(props) - - -def assert_request_equal(request, dictionary): - for method, requests in dictionary.items(): - mmr = getattr(request, method) - assert mmr.requests == requests - - empty_methods = [method for method in SIMPLE_METHODS if method not in dictionary] - for method in empty_methods: - assert not len(getattr(request, method).requests) - - -def record_metadata(obj, method, record_default=True, **kwargs): - """Utility function to store passed metadata to a method. - - If record_default is False, kwargs whose values are "default" are skipped. - This is so that checks on keyword arguments whose default was not changed - are skipped. - - """ - if not hasattr(obj, "_records"): - obj._records = {} - if not record_default: - kwargs = { - key: val - for key, val in kwargs.items() - if not isinstance(val, str) or (val != "default") - } - obj._records[method] = kwargs - - -def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): - """Check whether the expected metadata is passed to the object's method. - - Parameters - ---------- - split_params : tuple, default=empty - specifies any parameters which are to be checked as being a subset - of the original values. - - """ - records = getattr(obj, "_records", dict()).get(method, dict()) - assert set(kwargs.keys()) == set(records.keys()) - for key, value in kwargs.items(): - recorded_value = records[key] - # The following condition is used to check for any specified parameters - # being a subset of the original values - if key in split_params and recorded_value is not None: - assert np.isin(recorded_value, value).all() - else: - assert recorded_value is value - - -class MetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): - """A meta-regressor which is only a router.""" - - def __init__(self, estimator): - self.estimator = estimator - - def fit(self, X, y, **fit_params): - params = process_routing(self, "fit", fit_params) - self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) - - def get_metadata_routing(self): - router = MetadataRouter(owner=self.__class__.__name__).add( - estimator=self.estimator, method_mapping="one-to-one" - ) - return router - - -class RegressorMetadata(RegressorMixin, BaseEstimator): - """A regressor consuming a metadata.""" - - def fit(self, X, y, sample_weight=None): - record_metadata(self, "fit", sample_weight=sample_weight) - return self - - def predict(self, X): - return np.zeros(shape=(len(X))) - - -class WeightedMetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): - """A meta-regressor which is also a consumer.""" - - def __init__(self, estimator): - self.estimator = estimator - - def fit(self, X, y, sample_weight=None, **fit_params): - record_metadata(self, "fit", sample_weight=sample_weight) - params = process_routing(self, "fit", fit_params, sample_weight=sample_weight) - self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) - return self - - def predict(self, X, **predict_params): - params = process_routing(self, "predict", predict_params) - return self.estimator_.predict(X, 
**params.estimator.predict) - - def get_metadata_routing(self): - router = ( - MetadataRouter(owner=self.__class__.__name__) - .add_self_request(self) - .add(estimator=self.estimator, method_mapping="one-to-one") - ) - return router - - -class ClassifierNoMetadata(ClassifierMixin, BaseEstimator): - """An estimator which accepts no metadata on any method.""" - - def fit(self, X, y): - return self - - def predict(self, X): - return np.ones(len(X)) # pragma: no cover - - -class ClassifierFitMetadata(ClassifierMixin, BaseEstimator): - """An estimator accepting two metadata in its ``fit`` method.""" - - def fit(self, X, y, sample_weight=None, brand=None): - record_metadata(self, "fit", sample_weight=sample_weight, brand=brand) - return self - - def predict(self, X): - return np.ones(len(X)) # pragma: no cover - - -class SimpleMetaClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): - """A meta-estimator which also consumes sample_weight itself in ``fit``.""" - - def __init__(self, estimator): - self.estimator = estimator - - def fit(self, X, y, sample_weight=None, **kwargs): - record_metadata(self, "fit", sample_weight=sample_weight) - params = process_routing(self, "fit", kwargs, sample_weight=sample_weight) - self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) - return self - - def get_metadata_routing(self): - router = ( - MetadataRouter(owner=self.__class__.__name__) - .add_self_request(self) - .add(estimator=self.estimator, method_mapping="fit") - ) - return router - - -class TransformerMetadata(TransformerMixin, BaseEstimator): - """A transformer which accepts metadata on fit and transform.""" - - def fit(self, X, y=None, brand=None, sample_weight=None): - record_metadata(self, "fit", brand=brand, sample_weight=sample_weight) - return self - - def transform(self, X, sample_weight=None): - record_metadata(self, "transform", sample_weight=sample_weight) - return X - - -class MetaTransformer(MetaEstimatorMixin, TransformerMixin, BaseEstimator): - """A simple meta-transformer.""" - - def __init__(self, transformer): - self.transformer = transformer - - def fit(self, X, y=None, **fit_params): - params = process_routing(self, "fit", fit_params) - self.transformer_ = clone(self.transformer).fit(X, y, **params.transformer.fit) - return self - - def transform(self, X, y=None, **transform_params): - params = process_routing(self, "transform", transform_params) - return self.transformer_.transform(X, **params.transformer.transform) - - def get_metadata_routing(self): - return MetadataRouter(owner=self.__class__.__name__).add( - transformer=self.transformer, method_mapping="one-to-one" - ) - - class SimplePipeline(BaseEstimator): """A very simple pipeline, assuming the last step is always a predictor.""" @@ -267,7 +74,7 @@ def __init__(self, steps): def fit(self, X, y, **fit_params): self.steps_ = [] - params = process_routing(self, "fit", fit_params) + params = process_routing(self, "fit", **fit_params) X_transformed = X for i, step in enumerate(self.steps[:-1]): transformer = clone(step).fit( @@ -286,7 +93,7 @@ def fit(self, X, y, **fit_params): def predict(self, X, **predict_params): check_is_fitted(self) X_transformed = X - params = process_routing(self, "predict", predict_params) + params = process_routing(self, "predict", **predict_params) for i, step in enumerate(self.steps_[:-1]): X_transformed = step.transform(X, **params.get(f"step_{i}").transform) @@ -334,10 +141,27 @@ def test_assert_request_is_empty(): assert_request_is_empty( MetadataRouter(owner="test") 
.add_self_request(WeightedMetaRegressor(estimator=None)) - .add(method_mapping="fit", estimator=RegressorMetadata()) + .add(method_mapping="fit", estimator=ConsumingRegressor()) ) +@pytest.mark.parametrize( + "estimator", + [ + ConsumingClassifier(registry=_Registry()), + ConsumingRegressor(registry=_Registry()), + ConsumingTransformer(registry=_Registry()), + NonConsumingClassifier(registry=_Registry()), + WeightedMetaClassifier(estimator=ConsumingClassifier(), registry=_Registry()), + WeightedMetaRegressor(estimator=ConsumingRegressor(), registry=_Registry()), + ], +) +def test_estimator_puts_self_in_registry(estimator): + """Check that an estimator puts itself in the registry upon fit.""" + estimator.fit(X, y) + assert estimator in estimator.registry + + @pytest.mark.parametrize( "val, res", [ @@ -383,90 +207,90 @@ class OddEstimator(BaseEstimator): assert odd_request.fit.requests == {"sample_weight": True} # check other test estimators - assert not len(get_routing_for_object(ClassifierNoMetadata()).fit.requests) - assert_request_is_empty(ClassifierNoMetadata().get_metadata_routing()) + assert not len(get_routing_for_object(NonConsumingClassifier()).fit.requests) + assert_request_is_empty(NonConsumingClassifier().get_metadata_routing()) - trs_request = get_routing_for_object(TransformerMetadata()) + trs_request = get_routing_for_object(ConsumingTransformer()) assert trs_request.fit.requests == { "sample_weight": None, - "brand": None, + "metadata": None, } assert trs_request.transform.requests == { "sample_weight": None, } assert_request_is_empty(trs_request) - est_request = get_routing_for_object(ClassifierFitMetadata()) + est_request = get_routing_for_object(ConsumingClassifier()) assert est_request.fit.requests == { "sample_weight": None, - "brand": None, + "metadata": None, } assert_request_is_empty(est_request) def test_process_routing_invalid_method(): with pytest.raises(TypeError, match="Can only route and process input"): - process_routing(ClassifierFitMetadata(), "invalid_method", {}) + process_routing(ConsumingClassifier(), "invalid_method", **{}) def test_process_routing_invalid_object(): class InvalidObject: pass - with pytest.raises(AttributeError, match="has not implemented the routing"): - process_routing(InvalidObject(), "fit", {}) + with pytest.raises(AttributeError, match="either implement the routing method"): + process_routing(InvalidObject(), "fit", **{}) def test_simple_metadata_routing(): # Tests that metadata is properly routed # The underlying estimator doesn't accept or request metadata - clf = SimpleMetaClassifier(estimator=ClassifierNoMetadata()) + clf = WeightedMetaClassifier(estimator=NonConsumingClassifier()) clf.fit(X, y) # Meta-estimator consumes sample_weight, but doesn't forward it to the underlying # estimator - clf = SimpleMetaClassifier(estimator=ClassifierNoMetadata()) + clf = WeightedMetaClassifier(estimator=NonConsumingClassifier()) clf.fit(X, y, sample_weight=my_weights) # If the estimator accepts the metadata but doesn't explicitly say it doesn't # need it, there's an error - clf = SimpleMetaClassifier(estimator=ClassifierFitMetadata()) + clf = WeightedMetaClassifier(estimator=ConsumingClassifier()) err_message = ( "[sample_weight] are passed but are not explicitly set as requested or" - " not for ClassifierFitMetadata.fit" + " not for ConsumingClassifier.fit" ) with pytest.raises(ValueError, match=re.escape(err_message)): clf.fit(X, y, sample_weight=my_weights) # Explicitly saying the estimator doesn't need it, makes the error go away, - # 
because in this case `SimpleMetaClassifier` consumes `sample_weight`. If + # because in this case `WeightedMetaClassifier` consumes `sample_weight`. If # there was no consumer of sample_weight, passing it would result in an # error. - clf = SimpleMetaClassifier( - estimator=ClassifierFitMetadata().set_fit_request(sample_weight=False) + clf = WeightedMetaClassifier( + estimator=ConsumingClassifier().set_fit_request(sample_weight=False) ) - # this doesn't raise since SimpleMetaClassifier itself is a consumer, + # this doesn't raise since WeightedMetaClassifier itself is a consumer, # and passing metadata to the consumer directly is fine regardless of its # metadata_request values. clf.fit(X, y, sample_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit", sample_weight=None, brand=None) + check_recorded_metadata(clf.estimator_, "fit") # Requesting a metadata will make the meta-estimator forward it correctly - clf = SimpleMetaClassifier( - estimator=ClassifierFitMetadata().set_fit_request(sample_weight=True) + clf = WeightedMetaClassifier( + estimator=ConsumingClassifier().set_fit_request(sample_weight=True) ) clf.fit(X, y, sample_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights, brand=None) + check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights) # And requesting it with an alias - clf = SimpleMetaClassifier( - estimator=ClassifierFitMetadata().set_fit_request( + clf = WeightedMetaClassifier( + estimator=ConsumingClassifier().set_fit_request( sample_weight="alternative_weight" ) ) clf.fit(X, y, alternative_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights, brand=None) + check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights) def test_nested_routing(): @@ -474,23 +298,23 @@ def test_nested_routing(): pipeline = SimplePipeline( [ MetaTransformer( - transformer=TransformerMetadata() - .set_fit_request(brand=True, sample_weight=False) + transformer=ConsumingTransformer() + .set_fit_request(metadata=True, sample_weight=False) .set_transform_request(sample_weight=True) ), WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request( - sample_weight="inner_weights" - ) + estimator=ConsumingRegressor() + .set_fit_request(sample_weight="inner_weights", metadata=False) + .set_predict_request(sample_weight=False) ).set_fit_request(sample_weight="outer_weights"), ] ) w1, w2, w3 = [1], [2], [3] pipeline.fit( - X, y, brand=my_groups, sample_weight=w1, outer_weights=w2, inner_weights=w3 + X, y, metadata=my_groups, sample_weight=w1, outer_weights=w2, inner_weights=w3 ) check_recorded_metadata( - pipeline.steps_[0].transformer_, "fit", brand=my_groups, sample_weight=None + pipeline.steps_[0].transformer_, "fit", metadata=my_groups, sample_weight=None ) check_recorded_metadata( pipeline.steps_[0].transformer_, "transform", sample_weight=w1 @@ -509,12 +333,12 @@ def test_nested_routing_conflict(): pipeline = SimplePipeline( [ MetaTransformer( - transformer=TransformerMetadata() - .set_fit_request(brand=True, sample_weight=False) + transformer=ConsumingTransformer() + .set_fit_request(metadata=True, sample_weight=False) .set_transform_request(sample_weight=True) ), WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request(sample_weight=True) + estimator=ConsumingRegressor().set_fit_request(sample_weight=True) ).set_fit_request(sample_weight="outer_weights"), ] ) @@ -530,13 +354,13 @@ def test_nested_routing_conflict(): ) ), ): - pipeline.fit(X, y, 
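# The same request-or-raise behaviour outside of pytest, as a hedged sketch:
# passing metadata that a sub-estimator could consume but has not explicitly
# requested is an error, and declaring the request resolves it.
import numpy as np

from sklearn import config_context
from sklearn.tests.metadata_routing_common import (
    ConsumingClassifier,
    WeightedMetaClassifier,
)

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 1, 0, 1])
w = np.array([1.0, 2.0, 1.0, 2.0])

with config_context(enable_metadata_routing=True):
    clf = WeightedMetaClassifier(estimator=ConsumingClassifier())
    try:
        clf.fit(X, y, sample_weight=w)  # request on the inner estimator is unset
    except ValueError as exc:
        print(exc)

    clf = WeightedMetaClassifier(
        estimator=ConsumingClassifier().set_fit_request(sample_weight=True)
    )
    clf.fit(X, y, sample_weight=w)      # now routed to ConsumingClassifier.fit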
brand=my_groups, sample_weight=w1, outer_weights=w2) + pipeline.fit(X, y, metadata=my_groups, sample_weight=w1, outer_weights=w2) def test_invalid_metadata(): # check that passing wrong metadata raises an error trs = MetaTransformer( - transformer=TransformerMetadata().set_transform_request(sample_weight=True) + transformer=ConsumingTransformer().set_transform_request(sample_weight=True) ) with pytest.raises( TypeError, @@ -546,7 +370,7 @@ def test_invalid_metadata(): # passing a metadata which is not requested by any estimator should also raise trs = MetaTransformer( - transformer=TransformerMetadata().set_transform_request(sample_weight=False) + transformer=ConsumingTransformer().set_transform_request(sample_weight=False) ) with pytest.raises( TypeError, @@ -751,14 +575,14 @@ def test_metadata_router_consumes_method(): cases = [ ( WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request(sample_weight=True) + estimator=ConsumingRegressor().set_fit_request(sample_weight=True) ), {"sample_weight"}, {"sample_weight"}, ), ( WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request( + estimator=ConsumingRegressor().set_fit_request( sample_weight="my_weights" ) ), @@ -784,13 +608,13 @@ class WeightedMetaRegressorWarn(WeightedMetaRegressor): def test_estimator_warnings(): - class RegressorMetadataWarn(RegressorMetadata): + class ConsumingRegressorWarn(ConsumingRegressor): __metadata_request__fit = {"sample_weight": metadata_routing.WARN} with pytest.warns( UserWarning, match="Support for .* has recently been added to this class" ): - MetaRegressor(estimator=RegressorMetadataWarn()).fit( + MetaRegressor(estimator=ConsumingRegressorWarn()).fit( X, y, sample_weight=my_weights ) @@ -811,12 +635,14 @@ class RegressorMetadataWarn(RegressorMetadata): (MethodMapping.from_str("score"), "[{'callee': 'score', 'caller': 'score'}]"), ( MetadataRouter(owner="test").add( - method_mapping="predict", estimator=RegressorMetadata() + method_mapping="predict", estimator=ConsumingRegressor() ), ( - "{'estimator': {'mapping': [{'callee': 'predict', 'caller': " - "'predict'}], 'router': {'fit': {'sample_weight': None}, " - "'score': {'sample_weight': None}}}}" + "{'estimator': {'mapping': [{'callee': 'predict', 'caller':" + " 'predict'}], 'router': {'fit': {'sample_weight': None, 'metadata':" + " None}, 'partial_fit': {'sample_weight': None, 'metadata': None}," + " 'predict': {'sample_weight': None, 'metadata': None}, 'score':" + " {'sample_weight': None}}}}" ), ), ], @@ -857,7 +683,7 @@ def test_string_representations(obj, string): "Given `obj` is neither a `MetadataRequest` nor does it implement", ), ( - ClassifierFitMetadata(), + ConsumingClassifier(), "set_fit_request", {"invalid": True}, TypeError, @@ -900,14 +726,14 @@ def test_metadatarouter_add_self_request(): assert router._self_request is not request # one can add an estimator as self - est = RegressorMetadata().set_fit_request(sample_weight="my_weights") + est = ConsumingRegressor().set_fit_request(sample_weight="my_weights") router = MetadataRouter(owner="test").add_self_request(obj=est) assert str(router._self_request) == str(est.get_metadata_routing()) assert router._self_request is not est.get_metadata_routing() # adding a consumer+router as self should only add the consumer part est = WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request(sample_weight="nested_weights") + estimator=ConsumingRegressor().set_fit_request(sample_weight="nested_weights") ) router = 
MetadataRouter(owner="test").add_self_request(obj=est) # _get_metadata_request() returns the consumer part of the requests @@ -923,25 +749,27 @@ def test_metadata_routing_add(): # adding one with a string `method_mapping` router = MetadataRouter(owner="test").add( method_mapping="fit", - est=RegressorMetadata().set_fit_request(sample_weight="weights"), + est=ConsumingRegressor().set_fit_request(sample_weight="weights"), ) assert ( str(router) - == "{'est': {'mapping': [{'callee': 'fit', 'caller': 'fit'}], " - "'router': {'fit': {'sample_weight': 'weights'}, 'score': " - "{'sample_weight': None}}}}" + == "{'est': {'mapping': [{'callee': 'fit', 'caller': 'fit'}], 'router': {'fit':" + " {'sample_weight': 'weights', 'metadata': None}, 'partial_fit':" + " {'sample_weight': None, 'metadata': None}, 'predict': {'sample_weight':" + " None, 'metadata': None}, 'score': {'sample_weight': None}}}}" ) # adding one with an instance of MethodMapping router = MetadataRouter(owner="test").add( method_mapping=MethodMapping().add(callee="score", caller="fit"), - est=RegressorMetadata().set_score_request(sample_weight=True), + est=ConsumingRegressor().set_score_request(sample_weight=True), ) assert ( str(router) - == "{'est': {'mapping': [{'callee': 'score', 'caller': 'fit'}], " - "'router': {'fit': {'sample_weight': None}, 'score': " - "{'sample_weight': True}}}}" + == "{'est': {'mapping': [{'callee': 'score', 'caller': 'fit'}], 'router':" + " {'fit': {'sample_weight': None, 'metadata': None}, 'partial_fit':" + " {'sample_weight': None, 'metadata': None}, 'predict': {'sample_weight':" + " None, 'metadata': None}, 'score': {'sample_weight': True}}}}" ) @@ -949,13 +777,13 @@ def test_metadata_routing_get_param_names(): router = ( MetadataRouter(owner="test") .add_self_request( - WeightedMetaRegressor(estimator=RegressorMetadata()).set_fit_request( + WeightedMetaRegressor(estimator=ConsumingRegressor()).set_fit_request( sample_weight="self_weights" ) ) .add( method_mapping="fit", - trs=TransformerMetadata().set_fit_request( + trs=ConsumingTransformer().set_fit_request( sample_weight="transform_weights" ), ) @@ -963,24 +791,23 @@ def test_metadata_routing_get_param_names(): assert ( str(router) - == "{'$self_request': {'fit': {'sample_weight': 'self_weights'}, 'score': " - "{'sample_weight': None}}, 'trs': {'mapping': [{'callee': 'fit', " - "'caller': 'fit'}], 'router': {'fit': {'brand': None, " - "'sample_weight': 'transform_weights'}, 'transform': " - "{'sample_weight': None}}}}" + == "{'$self_request': {'fit': {'sample_weight': 'self_weights'}, 'score':" + " {'sample_weight': None}}, 'trs': {'mapping': [{'callee': 'fit', 'caller':" + " 'fit'}], 'router': {'fit': {'sample_weight': 'transform_weights'," + " 'metadata': None}, 'transform': {'sample_weight': None}}}}" ) assert router._get_param_names( method="fit", return_alias=True, ignore_self_request=False - ) == {"transform_weights", "brand", "self_weights"} + ) == {"transform_weights", "metadata", "self_weights"} # return_alias=False will return original names for "self" assert router._get_param_names( method="fit", return_alias=False, ignore_self_request=False - ) == {"sample_weight", "brand", "transform_weights"} + ) == {"sample_weight", "metadata", "transform_weights"} # ignoring self would remove "sample_weight" assert router._get_param_names( method="fit", return_alias=False, ignore_self_request=True - ) == {"brand", "transform_weights"} + ) == {"metadata", "transform_weights"} # return_alias is ignored when ignore_self_request=True assert 
router._get_param_names( method="fit", return_alias=True, ignore_self_request=True @@ -1138,9 +965,9 @@ def test_no_feature_flag_raises_error(): """Test that when feature flag disabled, set_{method}_requests raises.""" with config_context(enable_metadata_routing=False): with pytest.raises(RuntimeError, match="This method is only available"): - ClassifierFitMetadata().set_fit_request(sample_weight=True) + ConsumingClassifier().set_fit_request(sample_weight=True) def test_none_metadata_passed(): """Test that passing None as metadata when not requested doesn't raise""" - MetaRegressor(estimator=RegressorMetadata()).fit(X, y, sample_weight=None) + MetaRegressor(estimator=ConsumingRegressor()).fit(X, y, sample_weight=None) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 768a57c61dc52..4a548fe9f067f 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -1,28 +1,27 @@ import copy import re -from functools import partial import numpy as np import pytest from sklearn import config_context -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.calibration import CalibratedClassifierCV from sklearn.exceptions import UnsetMetadataPassedError from sklearn.linear_model import LogisticRegressionCV -from sklearn.metrics._scorer import _BaseScorer -from sklearn.model_selection import BaseCrossValidator -from sklearn.model_selection._split import GroupsConsumerMixin from sklearn.multioutput import ( ClassifierChain, MultiOutputClassifier, MultiOutputRegressor, RegressorChain, ) -from sklearn.tests.test_metadata_routing import ( +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingRegressor, + ConsumingScorer, + ConsumingSplitter, + _Registry, assert_request_is_empty, check_recorded_metadata, - record_metadata, ) from sklearn.utils.metadata_routing import MetadataRouter @@ -43,179 +42,6 @@ def enable_slep006(): yield -record_metadata_not_default = partial(record_metadata, record_default=False) - - -class _Registry(list): - # This list is used to get a reference to the sub-estimators, which are not - # necessarily stored on the metaestimator. We need to override __deepcopy__ - # because the sub-estimators are probably cloned, which would result in a - # new copy of the list, but we need copy and deep copy both to return the - # same instance. - def __deepcopy__(self, memo): - return self - - def __copy__(self): - return self - - -class ConsumingRegressor(RegressorMixin, BaseEstimator): - """A regressor consuming metadata. - - Parameters - ---------- - registry : list, default=None - If a list, the estimator will append itself to the list in order to have - a reference to the estimator later on. Since that reference is not - required in all tests, registration can be skipped by leaving this value - as None. 
- - """ - - def __init__(self, registry=None): - self.registry = registry - - def partial_fit(self, X, y, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "partial_fit", sample_weight=sample_weight, metadata=metadata - ) - return self - - def fit(self, X, y, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "fit", sample_weight=sample_weight, metadata=metadata - ) - return self - - def predict(self, X, sample_weight="default", metadata="default"): - pass # pragma: no cover - - # when needed, uncomment the implementation - # if self.registry is not None: - # self.registry.append(self) - - # record_metadata_not_default( - # self, "predict", sample_weight=sample_weight, metadata=metadata - # ) - # return np.zeros(shape=(len(X),)) - - -class ConsumingClassifier(ClassifierMixin, BaseEstimator): - """A classifier consuming metadata. - - Parameters - ---------- - registry : list, default=None - If a list, the estimator will append itself to the list in order to have - a reference to the estimator later on. Since that reference is not - required in all tests, registration can be skipped by leaving this value - as None. - - """ - - def __init__(self, registry=None): - self.registry = registry - - def partial_fit(self, X, y, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "partial_fit", sample_weight=sample_weight, metadata=metadata - ) - self.classes_ = [0, 1] - return self - - def fit(self, X, y, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "fit", sample_weight=sample_weight, metadata=metadata - ) - self.classes_ = [0, 1] - return self - - def predict(self, X, sample_weight="default", metadata="default"): - pass # pragma: no cover - - # when needed, uncomment the implementation - # if self.registry is not None: - # self.registry.append(self) - - # record_metadata_not_default( - # self, "predict", sample_weight=sample_weight, metadata=metadata - # ) - # return np.zeros(shape=(len(X),)) - - def predict_proba(self, X, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "predict_proba", sample_weight=sample_weight, metadata=metadata - ) - return np.asarray([[0.0, 1.0]] * len(X)) - - def predict_log_proba(self, X, sample_weight="default", metadata="default"): - pass # pragma: no cover - - # when needed, uncomment the implementation - # if self.registry is not None: - # self.registry.append(self) - - # record_metadata_not_default( - # self, "predict_log_proba", sample_weight=sample_weight, metadata=metadata - # ) - # return np.zeros(shape=(len(X), 2)) - - -class ConsumingScorer(_BaseScorer): - def __init__(self, registry=None): - super().__init__(score_func="test", sign=1, kwargs={}) - self.registry = registry - - def __call__( - self, estimator, X, y_true, sample_weight="default", metadata="default" - ): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "score", sample_weight=sample_weight, metadata=metadata - ) - - return 0.0 - - -class ConsumingSplitter(BaseCrossValidator, GroupsConsumerMixin): - def __init__(self, registry=None): - self.registry = registry - - 
def split(self, X, y=None, groups="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default(self, "split", groups=groups) - - split_index = len(X) - 10 - train_indices = range(0, split_index) - test_indices = range(split_index, len(X)) - yield test_indices, train_indices - - def get_n_splits(self, X=None, y=None, groups=None): - pass # pragma: no cover - - def _iter_test_indices(self, X=None, y=None, groups=None): - pass # pragma: no cover - - METAESTIMATORS: list = [ { "metaestimator": MultiOutputRegressor, @@ -279,7 +105,7 @@ def _iter_test_indices(self, X=None, y=None, groups=None): # ids used for pytest fixture METAESTIMATOR_IDS = [str(row["metaestimator"].__name__) for row in METAESTIMATORS] -CV_SCORERS = [ +CV_SCORERS: list = [ { "cv_estimator": LogisticRegressionCV, "scorer_name": "scoring", @@ -287,7 +113,7 @@ def _iter_test_indices(self, X=None, y=None, groups=None): }, ] -CV_SPLITTERS = [ +CV_SPLITTERS: list = [ { "cv_estimator": LogisticRegressionCV, "splitter_name": "cv", @@ -295,6 +121,10 @@ def _iter_test_indices(self, X=None, y=None, groups=None): } ] +# IDs used by pytest to get meaningful verbose messages when running the tests +CV_SCORER_IDS = [x["cv_estimator"].__name__ for x in CV_SCORERS] +CV_SPLITTER_IDS = [x["cv_estimator"].__name__ for x in CV_SPLITTERS] + def test_registry_copy(): # test that _Registry is not copied into a new instance. @@ -390,7 +220,7 @@ def set_request(estimator, method_name): check_recorded_metadata(estimator, method_name, **kwargs) -@pytest.mark.parametrize("cv_scorer", CV_SCORERS) +@pytest.mark.parametrize("cv_scorer", CV_SCORERS, ids=CV_SCORER_IDS) def test_metadata_is_routed_correctly_to_scorer(cv_scorer): """Test that any requested metadata is correctly routed to the underlying scorers in CV estimators. @@ -406,6 +236,8 @@ def test_metadata_is_routed_correctly_to_scorer(cv_scorer): instance = cls(**{scorer_name: scorer}) method = getattr(instance, method_name) kwargs = {"sample_weight": sample_weight} + if "fit" not in method_name: # instance needs to be fitted first + instance.fit(X, y) method(X, y, **kwargs) for _scorer in registry: check_recorded_metadata( @@ -416,7 +248,7 @@ def test_metadata_is_routed_correctly_to_scorer(cv_scorer): ) -@pytest.mark.parametrize("cv_splitter", CV_SPLITTERS) +@pytest.mark.parametrize("cv_splitter", CV_SPLITTERS, ids=CV_SPLITTER_IDS) def test_metadata_is_routed_correctly_to_splitter(cv_splitter): """Test that any requested metadata is correctly routed to the underlying splitters in CV estimators. 
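# A heavily hedged sketch of the splitter path exercised by the test above,
# assuming a build where LogisticRegressionCV supports metadata routing
# (as the tests in this file require):
import numpy as np

from sklearn import config_context
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tests.metadata_routing_common import (
    ConsumingSplitter,
    _Registry,
    check_recorded_metadata,
)

rng = np.random.RandomState(0)
X = rng.normal(size=(20, 3))
y = np.tile([0, 1], 10)
groups = np.tile([0, 1], 10)

with config_context(enable_metadata_routing=True):
    registry = _Registry()
    clf = LogisticRegressionCV(cv=ConsumingSplitter(registry=registry))
    # groups is requested by the splitter (via GroupsConsumerMixin), so it is
    # routed from fit down to ConsumingSplitter.split.
    clf.fit(X, y, groups=groups)
    for splitter in registry:
        check_recorded_metadata(
            splitter, "split", split_params=("groups",), groups=groups
        )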
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index c4e565e13aae1..793e5793aec3f 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -17,7 +17,11 @@ from sklearn.datasets import load_iris from sklearn.decomposition import PCA, TruncatedSVD from sklearn.dummy import DummyRegressor -from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + RandomForestClassifier, + RandomTreesEmbedding, +) from sklearn.exceptions import NotFittedError from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_selection import SelectKBest, f_classif @@ -27,7 +31,7 @@ from sklearn.model_selection import train_test_split from sklearn.neighbors import LocalOutlierFactor from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import FunctionTransformer, StandardScaler from sklearn.svm import SVC from sklearn.utils._metadata_requests import COMPOSITE_METHODS, METHODS from sklearn.utils._testing import ( @@ -1828,5 +1832,26 @@ def test_routing_passed_metadata_not_supported(method): getattr(pipe, method)([[1]], sample_weight=[1], prop="a") +@pytest.mark.usefixtures("enable_slep006") +def test_pipeline_with_estimator_with_len(): + """Test that pipeline works with estimators that have a `__len__` method.""" + pipe = Pipeline( + [("trs", RandomTreesEmbedding()), ("estimator", RandomForestClassifier())] + ) + pipe.fit([[1]], [1]) + pipe.predict([[1]]) + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize("last_step", [None, "passthrough"]) +def test_pipeline_with_no_last_step(last_step): + """Test that the pipeline works when there is not last step. + + It should just ignore and pass through the data on transform. + """ + pipe = Pipeline([("trs", FunctionTransformer()), ("estimator", last_step)]) + assert pipe.fit([[1]], [1]).transform([[1], [2], [3]]) == [[1], [2], [3]] + + # End of routing tests # ==================== diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 7482fd6022e50..26267a1355f6f 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -193,7 +193,7 @@ def _support_missing_values(self, X): and self.monotonic_cst is None ) - def _compute_missing_values_in_feature_mask(self, X): + def _compute_missing_values_in_feature_mask(self, X, estimator_name=None): """Return boolean mask denoting if there are missing values for each feature. This method also ensures that X is finite. @@ -203,13 +203,17 @@ def _compute_missing_values_in_feature_mask(self, X): X : array-like of shape (n_samples, n_features), dtype=DOUBLE Input data. + estimator_name : str or None, default=None + Name to use when raising an error. Defaults to the class name. + Returns ------- missing_values_in_feature_mask : ndarray of shape (n_features,), or None Missing value mask. If missing values are not supported or there are no missing values, return None. 
""" - common_kwargs = dict(estimator_name=self.__class__.__name__, input_name="X") + estimator_name = estimator_name or self.__class__.__name__ + common_kwargs = dict(estimator_name=estimator_name, input_name="X") if not self._support_missing_values(X): assert_all_finite(X, **common_kwargs) diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 4a23f4d2da946..b43ce1712709d 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -85,7 +85,7 @@ def __repr__(self): "decision_tree": [DecisionTreeClassifier, DecisionTreeRegressor], "max_depth": [Interval(Integral, 0, None, closed="left"), None], "feature_names": [list, None], - "class_names": [list, None], + "class_names": ["array-like", "boolean", None], "label": [StrOptions({"all", "root", "none"})], "filled": ["boolean"], "impurity": ["boolean"], @@ -140,7 +140,7 @@ def plot_tree( Names of each of the features. If None, generic names will be used ("x[0]", "x[1]", ...). - class_names : list of str or bool, default=None + class_names : array-like of str or True, default=None Names of each of the target classes in ascending numerical order. Only relevant for classification and not supported for multi-output. If ``True``, shows a symbolic representation of the class name. diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 87173032a3bd3..f8b4d2042223c 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -188,7 +188,7 @@ def _array_indexing(array, key, key_dtype, axis): key = np.asarray(key) if isinstance(key, tuple): key = list(key) - return array[key] if axis == 0 else array[:, key] + return array[key, ...] if axis == 0 else array[:, key] def _pandas_indexing(X, key, key_dtype, axis): diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index ca0d9fcaf1509..ed16ce767a0cd 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -186,6 +186,9 @@ def __init__(self, array_namespace): def __getattr__(self, name): return getattr(self._namespace, name) + def __eq__(self, other): + return self._namespace == other._namespace + def take(self, X, indices, *, axis=0): # When array_api supports `take` we can use this directly # https://github.com/data-apis/array-api/issues/177 diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 5affa4616be01..fb3912b27dbfe 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -177,7 +177,7 @@ def _unique_python(values, *, return_inverse, return_counts): except TypeError: types = sorted(t.__qualname__ for t in set(type(v) for v in values)) raise TypeError( - "Encoders require their input to be uniformly " + "Encoders require their input argument must be uniformly " f"strings or numbers. 
Got {types}" ) ret = (uniques,) diff --git a/sklearn/utils/_estimator_html_repr.py b/sklearn/utils/_estimator_html_repr.py index e9b95666cdd32..207096823cae6 100644 --- a/sklearn/utils/_estimator_html_repr.py +++ b/sklearn/utils/_estimator_html_repr.py @@ -190,13 +190,35 @@ def _write_estimator_html( _STYLE = """ #$id { - color: black; + --sklearn-color-text: black; + --sklearn-color-line: gray; + --sklearn-color-background: white; + --sklearn-color-background-box: #f0f8ff; + --sklearn-color-border-box: black; + --sklearn-color-icon: #696969; + --sklearn-color-active: #d4ebff; + --sklearn-color-highlight: #d4ebff; + + @media (prefers-color-scheme: dark) { + --sklearn-color-text: white; + --sklearn-color-line: gray; + --sklearn-color-background: #111; + --sklearn-color-background-box: #424242; + --sklearn-color-border-box: white; + --sklearn-color-icon: #878787; + --sklearn-color-active: #616161; + --sklearn-color-highlight: #616161; + } +} + +#$id { + color: var(--sklearn-color-text); } #$id pre{ padding: 0; } #$id div.sk-toggleable { - background-color: white; + background-color: var(--sklearn-color-background); } #$id label.sk-toggleable__label { cursor: pointer; @@ -211,26 +233,26 @@ def _write_estimator_html( content: "▸"; float: left; margin-right: 0.25em; - color: #696969; + color: var(--sklearn-color-icon); } #$id label.sk-toggleable__label-arrow:hover:before { - color: black; + color: var(--sklearn-color-text); } #$id div.sk-estimator:hover label.sk-toggleable__label-arrow:before { - color: black; + color: var(--sklearn-color-text); } #$id div.sk-toggleable__content { max-height: 0; max-width: 0; overflow: hidden; text-align: left; - background-color: #f0f8ff; + background-color: var(--sklearn-color-background-box); } #$id div.sk-toggleable__content pre { margin: 0.2em; - color: black; + color: var(--sklearn-color-text); border-radius: 0.25em; - background-color: #f0f8ff; + background-color: var(--sklearn-color-background-box); } #$id input.sk-toggleable__control:checked~div.sk-toggleable__content { max-height: 200px; @@ -241,10 +263,10 @@ def _write_estimator_html( content: "▾"; } #$id div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label { - background-color: #d4ebff; + background-color: var(--sklearn-color-active); } #$id div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label { - background-color: #d4ebff; + background-color: var(--sklearn-color-active); } #$id input.sk-hidden--visually { border: 0; @@ -259,28 +281,28 @@ def _write_estimator_html( } #$id div.sk-estimator { font-family: monospace; - background-color: #f0f8ff; - border: 1px dotted black; + background-color: var(--sklearn-color-background-box); + border: 1px dotted var(--sklearn-color-border-box); border-radius: 0.25em; box-sizing: border-box; margin-bottom: 0.5em; } #$id div.sk-estimator:hover { - background-color: #d4ebff; + background-color: var(--sklearn-color-highlight); } #$id div.sk-parallel-item::after { content: ""; width: 100%; - border-bottom: 1px solid gray; + border-bottom: 1px solid var(--sklearn-color-line); flex-grow: 1; } #$id div.sk-label:hover label.sk-toggleable__label { - background-color: #d4ebff; + background-color: var(--sklearn-color-highlight); } #$id div.sk-serial::before { content: ""; position: absolute; - border-left: 1px solid gray; + border-left: 1px solid var(--sklearn-color-line); box-sizing: border-box; top: 0; bottom: 0; @@ -291,7 +313,7 @@ def _write_estimator_html( display: flex; flex-direction: column; align-items: center; - 
background-color: white; + background-color: var(--sklearn-color-background); padding-right: 0.2em; padding-left: 0.2em; position: relative; @@ -304,13 +326,13 @@ def _write_estimator_html( display: flex; align-items: stretch; justify-content: center; - background-color: white; + background-color: var(--sklearn-color-background); position: relative; } #$id div.sk-item::before, #$id div.sk-parallel-item::before { content: ""; position: absolute; - border-left: 1px solid gray; + border-left: 1px solid var(--sklearn-color-line); box-sizing: border-box; top: 0; bottom: 0; @@ -322,7 +344,7 @@ def _write_estimator_html( flex-direction: column; z-index: 1; position: relative; - background-color: white; + background-color: var(--sklearn-color-background); } #$id div.sk-parallel-item:first-child::after { align-self: flex-end; @@ -336,11 +358,11 @@ def _write_estimator_html( width: 0; } #$id div.sk-dashed-wrapped { - border: 1px dashed gray; + border: 1px dashed var(--sklearn-color-line); margin: 0 0.4em 0.5em 0.4em; box-sizing: border-box; padding-bottom: 0.4em; - background-color: white; + background-color: var(--sklearn-color-background); } #$id div.sk-label label { font-family: monospace; diff --git a/sklearn/utils/_metadata_requests.py b/sklearn/utils/_metadata_requests.py index 17d8e37510e48..1a9c07438b17a 100644 --- a/sklearn/utils/_metadata_requests.py +++ b/sklearn/utils/_metadata_requests.py @@ -80,7 +80,7 @@ import inspect from collections import namedtuple from copy import deepcopy -from typing import Optional, Union +from typing import TYPE_CHECKING, Optional, Union from warnings import warn from .. import get_config @@ -89,6 +89,9 @@ # Only the following methods are supported in the routing mechanism. Adding new # methods at the moment involves monkeypatching this list. +# Note that if this list is changed or monkeypatched, the corresponding method +# needs to be added under a TYPE_CHECKING condition like the one done here in +# _MetadataRequester SIMPLE_METHODS = [ "fit", "partial_fit", @@ -1251,6 +1254,27 @@ class _MetadataRequester: .. versionadded:: 1.3 """ + if TYPE_CHECKING: # pragma: no cover + # This code is never run in runtime, but it's here for type checking. + # Type checkers fail to understand that the `set_{method}_request` + # methods are dynamically generated, and they complain that they are + # not defined. We define them here to make type checkers happy. + # During type checking analyzers assume this to be True. + # The following list of defined methods mirrors the list of methods + # in SIMPLE_METHODS. + # fmt: off + def set_fit_request(self, **kwargs): pass + def set_partial_fit_request(self, **kwargs): pass + def set_predict_request(self, **kwargs): pass + def set_predict_proba_request(self, **kwargs): pass + def set_predict_log_proba_request(self, **kwargs): pass + def set_decision_function_request(self, **kwargs): pass + def set_score_request(self, **kwargs): pass + def set_split_request(self, **kwargs): pass + def set_transform_request(self, **kwargs): pass + def set_inverse_transform_request(self, **kwargs): pass + # fmt: on + def __init_subclass__(cls, **kwargs): """Set the ``set_{method}_request`` methods. @@ -1412,7 +1436,11 @@ def get_metadata_routing(self): # given metadata. This is to minimize the boilerplate required in routers. -def process_routing(obj, method, other_params, **kwargs): +# Here the first two arguments are positional only which makes everything +# passed as keyword argument a metadata. 
The first two args also have an `_` +# prefix to reduce the chances of name collisions with the passed metadata, and +# since they're positional only, users will never type those underscores. +def process_routing(_obj, _method, /, **kwargs): """Validate and route input parameters. This function is used inside a router's method, e.g. :term:`fit`, @@ -1420,26 +1448,21 @@ def process_routing(obj, method, other_params, **kwargs): Assuming this signature: ``fit(self, X, y, sample_weight=None, **fit_params)``, a call to this function would be: - ``process_routing(self, fit_params, sample_weight=sample_weight)``. + ``process_routing(self, sample_weight=sample_weight, **fit_params)``. .. versionadded:: 1.3 Parameters ---------- - obj : object + _obj : object An object implementing ``get_metadata_routing``. Typically a meta-estimator. - method : str + _method : str The name of the router's method in which this function is called. - other_params : dict - A dictionary of extra parameters passed to the router's method, - e.g. ``**fit_params`` passed to a meta-estimator's :term:`fit`. - **kwargs : dict - Parameters explicitly accepted and included in the router's method - signature. + Metadata to be routed. Returns ------- @@ -1449,27 +1472,20 @@ def process_routing(obj, method, other_params, **kwargs): corresponding methods or corresponding child objects. The object names are those defined in `obj.get_metadata_routing()`. """ - if not hasattr(obj, "get_metadata_routing"): + if not (hasattr(_obj, "get_metadata_routing") or isinstance(_obj, MetadataRouter)): raise AttributeError( - f"This {repr(obj.__class__.__name__)} has not implemented the routing" - " method `get_metadata_routing`." + f"The given object ({repr(_obj.__class__.__name__)}) needs to either" + " implement the routing method `get_metadata_routing` or be a" + " `MetadataRouter` instance." ) - if method not in METHODS: + if _method not in METHODS: raise TypeError( f"Can only route and process input on these methods: {METHODS}, " - f"while the passed method is: {method}." + f"while the passed method is: {_method}." ) - # We take the extra params (**fit_params) which is passed as `other_params` - # and add the explicitly passed parameters (passed as **kwargs) to it. This - # is equivalent to a code such as this in a router: - # if sample_weight is not None: - # fit_params["sample_weight"] = sample_weight - all_params = other_params if other_params is not None else dict() - all_params.update(kwargs) - - request_routing = get_routing_for_object(obj) - request_routing.validate_metadata(params=all_params, method=method) - routed_params = request_routing.route_params(params=all_params, caller=method) + request_routing = get_routing_for_object(_obj) + request_routing.validate_metadata(params=kwargs, method=_method) + routed_params = request_routing.route_params(params=kwargs, caller=_method) return routed_params diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py index 9eee7c370e341..bb289535c45ec 100644 --- a/sklearn/utils/_set_output.py +++ b/sklearn/utils/_set_output.py @@ -5,6 +5,7 @@ from .._config import get_config from . 
import check_pandas_support from ._available_if import available_if +from .validation import _is_pandas_df def _wrap_in_pandas_container( @@ -125,9 +126,10 @@ def _wrap_data_with_container(method, data_to_wrap, original_input, estimator): return data_to_wrap # dense_config == "pandas" + index = original_input.index if _is_pandas_df(original_input) else None return _wrap_in_pandas_container( data_to_wrap=data_to_wrap, - index=getattr(original_input, "index", None), + index=index, columns=estimator.get_feature_names_out, ) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 8b54df9f25b72..bf558ff7e6dd4 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -38,6 +38,7 @@ assert_array_almost_equal, assert_array_equal, assert_array_less, + assert_no_warnings, ) import sklearn @@ -65,6 +66,7 @@ "assert_approx_equal", "assert_allclose", "assert_run_python_script", + "assert_no_warnings", "SkipTest", ] @@ -80,32 +82,6 @@ assert_raises_regexp = assert_raises_regex -# To remove when we support numpy 1.7 -def assert_no_warnings(func, *args, **kw): - """ - Parameters - ---------- - func - *args - **kw - """ - # very important to avoid uncontrolled state propagation - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - result = func(*args, **kw) - if hasattr(np, "FutureWarning"): - # Filter out numpy-specific warnings in numpy >= 1.9 - w = [e for e in w if e.category is not np.VisibleDeprecationWarning] - - if len(w) > 0: - raise AssertionError( - "Got warnings when calling %s: [%s]" - % (func.__name__, ", ".join(str(warning) for warning in w)) - ) - return result - - def ignore_warnings(obj=None, category=Warning): """Context manager and decorator to ignore warnings. diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e77197e24a69e..53ae056b4d2f7 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1350,7 +1350,10 @@ def check_dtype_object(name, estimator_orig): if "string" not in tags["X_types"]: X[0, 0] = {"foo": "bar"} - msg = "argument must be a string.* number" + # This error is raised by: + # - `np.asarray` in `check_array` + # - `_unique_python` for encoders + msg = "argument must be .* string.* number" with raises(TypeError, match=msg): estimator.fit(X, y) else: @@ -3542,7 +3545,6 @@ def _enforce_estimator_tags_y(estimator, y): # Create strictly positive y. The minimal increment above 0 is 1, as # y could be of integer dtype. 
y += 1 + abs(y.min()) - # Estimators with a `binary_only` tag only accept up to two unique y values if _safe_tags(estimator, key="binary_only") and y.size > 0: y = np.where(y == y.flat[0], y, y.flat[0] + 1) # Estimators in mono_output_task_error raise ValueError if y is of 1-D @@ -3562,7 +3564,8 @@ def _enforce_estimator_tags_X(estimator, X, kernel=linear_kernel): if _safe_tags(estimator, key="requires_positive_X"): X = X - X.min() if "categorical" in _safe_tags(estimator, key="X_types"): - X = (X - X.min()).astype(np.int32) + dtype = np.float64 if _safe_tags(estimator, key="allow_nan") else np.int32 + X = np.round((X - X.min())).astype(dtype) if estimator.__class__.__name__ == "SkewedChi2Sampler": # SkewedChi2Sampler requires X > -skewdness in transform @@ -4584,7 +4587,7 @@ def check_set_output_transform_pandas(name, transformer_orig): outputs_pandas = _output_from_fit_transform(transformer_pandas, name, X, df, y) except ValueError as e: # transformer does not support sparse data - assert str(e) == "Pandas output does not support sparse data.", e + assert "Pandas output does not support sparse data." in str(e), e return for case in outputs_default: @@ -4630,7 +4633,7 @@ def check_global_output_transform_pandas(name, transformer_orig): ) except ValueError as e: # transformer does not support sparse data - assert str(e) == "Pandas output does not support sparse data.", e + assert "Pandas output does not support sparse data." in str(e), e return for case in outputs_default: diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 2202a1daaf90a..d33b638358157 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -158,3 +158,11 @@ def _contents(data_module): ) else: return resources.contents(data_module) + + +# For +1.25 NumPy versions exceptions and warnings are being moved +# to a dedicated submodule. 
+if np_version >= parse_version("1.25.0"): + from numpy.exceptions import VisibleDeprecationWarning +else: + from numpy import VisibleDeprecationWarning # type: ignore # noqa diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 892d77c7e01e5..1f46f6400df98 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -14,6 +14,7 @@ from scipy.sparse import issparse from ..utils._array_api import get_namespace +from ..utils.fixes import VisibleDeprecationWarning from .validation import _assert_all_finite, check_array @@ -161,10 +162,10 @@ def is_multilabel(y): ensure_min_features=0, ) with warnings.catch_warnings(): - warnings.simplefilter("error", np.VisibleDeprecationWarning) + warnings.simplefilter("error", VisibleDeprecationWarning) try: y = check_array(y, dtype=None, **check_y_kwargs) - except (np.VisibleDeprecationWarning, ValueError) as e: + except (VisibleDeprecationWarning, ValueError) as e: if str(e).startswith("Complex data not supported"): raise @@ -324,11 +325,11 @@ def type_of_target(y, input_name=""): ) with warnings.catch_warnings(): - warnings.simplefilter("error", np.VisibleDeprecationWarning) + warnings.simplefilter("error", VisibleDeprecationWarning) if not issparse(y): try: y = check_array(y, dtype=None, **check_y_kwargs) - except (np.VisibleDeprecationWarning, ValueError) as e: + except (VisibleDeprecationWarning, ValueError) as e: if str(e).startswith("Complex data not supported"): raise diff --git a/sklearn/utils/tests/test_estimator_html_repr.py b/sklearn/utils/tests/test_estimator_html_repr.py index e4327dcbc2c46..bbe44ac8974fa 100644 --- a/sklearn/utils/tests/test_estimator_html_repr.py +++ b/sklearn/utils/tests/test_estimator_html_repr.py @@ -197,6 +197,9 @@ def test_estimator_html_repr_pipeline(): assert f"" in html_output assert f"
<pre>{html.escape(str(est))}</pre>
" in html_output + # verify that prefers-color-scheme is implemented + assert "prefers-color-scheme" in html_output + @pytest.mark.parametrize("final_estimator", [None, LinearSVC()]) def test_stacking_classifier(final_estimator): diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py index a4aaa8f21b6b7..ec48c4a012574 100644 --- a/sklearn/utils/tests/test_pprint.py +++ b/sklearn/utils/tests/test_pprint.py @@ -12,7 +12,7 @@ # Ignore flake8 (lots of line too long issues) -# flake8: noqa +# ruff: noqa # Constructors excerpted to test pprinting diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py index 403a5db63ec54..d1722a1553f9c 100644 --- a/sklearn/utils/tests/test_set_output.py +++ b/sklearn/utils/tests/test_set_output.py @@ -315,3 +315,32 @@ def test_set_output_named_tuple_out(): assert isinstance(X_trans, Output) assert_array_equal(X_trans.X, X) assert_array_equal(X_trans.Y, 2 * X) + + +class EstimatorWithListInput(_SetOutputMixin): + def fit(self, X, y=None): + assert isinstance(X, list) + self.n_features_in_ = len(X[0]) + return self + + def transform(self, X, y=None): + return X + + def get_feature_names_out(self, input_features=None): + return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object) + + +def test_set_output_list_input(): + """Check set_output for list input. + + Non-regression test for #27037. + """ + pd = pytest.importorskip("pandas") + + X = [[0, 1, 2, 3], [4, 5, 6, 7]] + est = EstimatorWithListInput() + est.set_output(transform="pandas") + + X_out = est.fit(X).transform(X) + assert isinstance(X_out, pd.DataFrame) + assert_array_equal(X_out.columns, ["X0", "X1", "X2", "X3"])