diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index 6ae02da33..000000000 --- a/.coveragerc +++ /dev/null @@ -1,9 +0,0 @@ -[run] -branch = True -source = mlxtend -include = */mlxtend/* -omit = - */mlxtend/data/* - */mlxtend/general_plotting/* - */mlxtend/externals/* - */setup.py diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index ecf844c5d..b6eba040e 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -33,9 +33,13 @@ jobs: conda install imageio scikit-image -y -q conda install dlib -y -q pip install markdown + pip install coverage pip install -e . python -c "import numpy; print('NumPy:', numpy.__version__)" python -c "import scipy; print('SciPy:', scipy.__version__)" python -c "import sklearn; print('Scikit-learn:', sklearn.__version__)" python -c "import pandas; print('Pandas:', pandas.__version__)" - pytest -sv + coverage run --source=mlxtend --branch -m pytest mlxtend + coverage xml + - name: Upload Coverage to Codecov + uses: codecov/codecov-action@v2 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 4686ce552..7a1416677 100755 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ # Test Suites +htmlcov .coverage* .pytest_cache/ diff --git a/.pep8speaks.yml b/.pep8speaks.yml index dbbbf7cb7..46e73f004 100644 --- a/.pep8speaks.yml +++ b/.pep8speaks.yml @@ -7,7 +7,8 @@ scanner: flake8: # Same as scanner.linter value. Other option is flake8 max-line-length: 88 # Default is 79 in PEP 8 ignore: # Errors and warnings to ignore - - W504 # line break after binary operator + - W504 + - W503 # line break after binary operator no_blank_comment: False # If True, no comment is made on PR without any errors. descending_issues_order: False # If True, PEP 8 issues in message will be displayed in descending order of line numbers in the file diff --git a/README.md b/README.md index 9ebf1b854..bc7dec309 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ [![DOI](https://joss.theoj.org/papers/10.21105/joss.00638/status.svg)](https://doi.org/10.21105/joss.00638) [![PyPI version](https://badge.fury.io/py/mlxtend.svg)](http://badge.fury.io/py/mlxtend) [![Anaconda-Server Badge](https://anaconda.org/conda-forge/mlxtend/badges/version.svg)](https://anaconda.org/conda-forge/mlxtend) -[![Build statu s](https://ci.appveyor.com/api/projects/status/7vx20e0h5dxcyla2/branch/master?svg=true)](https://ci.appveyor.com/project/rasbt/mlxtend/branch/master) -[![Coverage Status](https://coveralls.io/repos/rasbt/mlxtend/badge.svg?branch=master&service=github)](https://coveralls.io/github/rasbt/mlxtend?branch=master) +[![Build status](https://ci.appveyor.com/api/projects/status/7vx20e0h5dxcyla2/branch/master?svg=true)](https://ci.appveyor.com/project/rasbt/mlxtend/branch/master) +[![codecov](https://codecov.io/gh/rasbt/mlxtend/branch/master/graph/badge.svg)](https://codecov.io/gh/rasbt/mlxtend) ![Python 3](https://img.shields.io/badge/python-3-blue.svg) ![License](https://img.shields.io/badge/license-BSD-blue.svg) [![Discuss](https://img.shields.io/badge/discuss-github-blue.svg)](https://github.com/rasbt/mlxtend/discussions) diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index f31d68d26..eb2dd86a5 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -22,7 +22,7 @@ The CHANGELOG for the current development version is available at ##### New Features and Enhancements - The `mlxtend.evaluate.feature_importance_permutation` function has a new 
`feature_groups` argument to treat user-specified feature groups as single features, which is useful for one-hot encoded features. ([#955](https://github.com/rasbt/mlxtend/pull/955)) -- The `mlxtend.feature_selection.ExhaustiveFeatureSelector` also gained support for `feature_groups` with a behavior similar to the one described above. ([#957](https://github.com/rasbt/mlxtend/pull/957) via [Nima Sarajpoor](https://github.com/NimaSarajpoor)) +- The `mlxtend.feature_selection.ExhaustiveFeatureSelector` and `SequentialFeatureSelector` also gained support for `feature_groups` with a behavior similar to the one described above. ([#957](https://github.com/rasbt/mlxtend/pull/957) and [#965](https://github.com/rasbt/mlxtend/pull/965) via [Nima Sarajpoor](https://github.com/NimaSarajpoor)) ##### Changes @@ -33,6 +33,7 @@ The CHANGELOG for the current development version is available at - None + ### Version 0.20.0 #### New Features and Enhancements diff --git a/docs/sources/CONTRIBUTING.md b/docs/sources/CONTRIBUTING.md index 1308de944..c4f0b8301 100755 --- a/docs/sources/CONTRIBUTING.md +++ b/docs/sources/CONTRIBUTING.md @@ -22,9 +22,20 @@ and checking off items as you go. 5. [ ] Add appropriate unit test functions in `mlxtend/*/tests` 6. [ ] Run `PYTHONPATH='.' pytest ./mlxtend -sv` and make sure that all unit tests pass -7. [ ] Modify documentation in the appropriate location under `mlxtend/docs/sources/` +7. [ ] Make sure the newly implemented feature has good test coverage: -8. [ ] Add a note about the modification/contribution to the `./docs/sources/changelog.md` file +``` +python -m pip install coverage +# test all: +# coverage run --source=mlxtend --branch -m pytest . +coverage run --source=mlxtend --branch -m pytest mlxtend/ +coverage html +``` + + +8. [ ] Modify documentation in the appropriate location under `mlxtend/docs/sources/` + +9. 
[ ] Add a note about the modification/contribution to the `./docs/sources/changelog.md` file diff --git a/docs/sources/user_guide/feature_selection/ExhaustiveFeatureSelector.ipynb b/docs/sources/user_guide/feature_selection/ExhaustiveFeatureSelector.ipynb index 4842a28a4..c7a2273c8 100644 --- a/docs/sources/user_guide/feature_selection/ExhaustiveFeatureSelector.ipynb +++ b/docs/sources/user_guide/feature_selection/ExhaustiveFeatureSelector.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -22,7 +22,7 @@ "text": [ "Author: Sebastian Raschka\n", "\n", - "Last updated: 2022-07-26\n", + "Last updated: 2022-09-13\n", "\n", "Python implementation: CPython\n", "Python version : 3.9.7\n", @@ -30,7 +30,7 @@ "\n", "matplotlib: 3.5.2\n", "numpy : 1.22.1\n", - "scipy : 1.7.3\n", + "scipy : 1.9.1\n", "mlxtend : 0.21.0.dev0\n", "\n" ] @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -120,7 +120,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -181,7 +181,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -260,7 +260,7 @@ "4 5.0 3.6 1.4 0.2" ] }, - "execution_count": 4, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -274,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -318,7 +318,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -389,7 +389,7 @@ " 'Petal width')}}" ] }, - "execution_count": 6, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -421,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -669,7 +669,7 @@ "1 0.037118 " ] }, - "execution_count": 7, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -704,7 +704,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -771,7 +771,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -878,7 +878,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -963,7 +963,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -974,7 +974,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -995,7 +995,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -1011,7 +1011,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -1037,7 +1037,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -1057,7 +1057,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -1084,7 +1084,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -1101,7 +1101,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 45, "metadata": {}, 
"outputs": [ { @@ -1139,7 +1139,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -1163,7 +1163,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -1218,7 +1218,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -1249,7 +1249,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -1271,7 +1271,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -1290,13 +1290,14 @@ " multi_class='multinomial',\n", " random_state=123,\n", " solver='newton-cg'),\n", - " max_features=3, min_features=2, print_progress=False)),\n", + " feature_groups=[[0], [1], [2], [3]], max_features=3,\n", + " min_features=2, print_progress=False)),\n", " ('logisticregression',\n", " LogisticRegression(multi_class='multinomial', random_state=123,\n", " solver='newton-cg'))]" ] }, - "execution_count": 23, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -1315,7 +1316,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -1339,7 +1340,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -1356,7 +1357,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 53, "metadata": {}, "outputs": [ { @@ -1365,7 +1366,7 @@ "{'exhaustivefeatureselector__estimator__C': 0.1}" ] }, - "execution_count": 26, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -1383,7 +1384,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 54, "metadata": {}, "outputs": [ { @@ -1417,7 +1418,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 55, "metadata": {}, "outputs": [ { @@ -1464,17 +1465,315 @@ "print('Best subset (corresponding names):', efs1.best_feature_names_)" ] }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Example 8 - Interrupting Long Runs for Intermediate Results" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## API" + "If your run is taking too long, it is possible to trigger a `KeyboardInterrupt` (e.g., ctrl+c on a Mac, or interrupting the cell in a Jupyter notebook) to obtain temporary results." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Toy dataset**" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import make_classification\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "\n", + "X, y = make_classification(\n", + " n_samples=200000,\n", + " n_features=6,\n", + " n_informative=2,\n", + " n_redundant=1,\n", + " n_repeated=1,\n", + " n_clusters_per_class=2,\n", + " flip_y=0.05,\n", + " class_sep=0.5,\n", + " random_state=123,\n", + ")\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=123\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Long run with interruption**" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Features: 56/56" + ] + } + ], + "source": [ + "from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "model = LogisticRegression(max_iter=10000)\n", + "\n", + "efs1 = EFS(model, \n", + " min_features=1, \n", + " max_features=4,\n", + " print_progress=True,\n", + " scoring='accuracy')\n", + "\n", + "efs1 = efs1.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Finalizing the fit**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the feature selection run hasn't finished, so certain attributes may not be available. In order to use the EFS instance, it is recommended to call `finalize_fit`, which will make EFS estimator appear as \"fitted\" process the temporary results:" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "efs1.finalize_fit()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best accuracy score: 0.73\n", + "Best subset (indices): (1, 2)\n" + ] + } + ], + "source": [ + "print('Best accuracy score: %.2f' % efs1.best_score_)\n", + "print('Best subset (indices):', efs1.best_idx_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 9 - Working with Feature Groups" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since mlxtend v0.21.0, it is possible to specify feature groups. Feature groups allow you to group certain features together, such that they are always selected as a group. This can be very useful in contexts similar to one-hot encoding -- if you want to treat the one-hot encoded feature as a single feature:\n", + "\n", + "![](SequentialFeatureSelector_files/feature_groups.jpeg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the following example, we specify sepal length and sepal width as a feature group so that they are always selected together:" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal lenpetal lensepal widpetal wid
05.13.51.40.2
14.93.01.40.2
24.73.21.30.2
34.63.11.50.2
45.03.61.40.2
\n", + "
" + ], + "text/plain": [ + " sepal len petal len sepal wid petal wid\n", + "0 5.1 3.5 1.4 0.2\n", + "1 4.9 3.0 1.4 0.2\n", + "2 4.7 3.2 1.3 0.2\n", + "3 4.6 3.1 1.5 0.2\n", + "4 5.0 3.6 1.4 0.2" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.datasets import load_iris\n", + "import pandas as pd\n", + "\n", + "iris = load_iris()\n", + "X = iris.data\n", + "y = iris.target\n", + "\n", + "X_df = pd.DataFrame(X, columns=['sepal len', 'petal len',\n", + " 'sepal wid', 'petal wid'])\n", + "X_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Features: 3/3" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best accuracy score: 0.97\n", + "Best subset (indices): (0, 2, 3)\n", + "Best subset (corresponding names): ('sepal len', 'sepal wid', 'petal wid')\n" + ] + } + ], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier\n", + "from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS\n", + "\n", + "knn = KNeighborsClassifier(n_neighbors=3)\n", + "\n", + "efs1 = EFS(knn, \n", + " min_features=2,\n", + " max_features=2,\n", + " scoring='accuracy',\n", + " feature_groups=[['sepal len', 'sepal wid'], ['petal len'], ['petal wid']],\n", + " cv=3)\n", + "\n", + "efs1 = efs1.fit(X_df, y)\n", + "\n", + "print('Best accuracy score: %.2f' % efs1.best_score_)\n", + "print('Best subset (indices):', efs1.best_idx_)\n", + "print('Best subset (corresponding names):', efs1.best_feature_names_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the returned number of features is 3, since the number of `min_features` and `max_features` corresponds to the number of feature groups. I.e., we have 2 feature groups in `['sepal len', 'sepal wid'], ['petal wid']`, but it expands to 3 features." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1574,10 +1873,16 @@ "- `feature_groups` : list or None (default: None)\n", "\n", " Optional argument for treating certain features as a group.\n", - " For example `[[1], [2], [3, 4, 5]]`, which can be useful for\n", + " This means, the features within a group are always selected together,\n", + " never split.\n", + " For example, `feature_groups=[[1], [2], [3, 4, 5]]`\n", + " specifies 3 feature groups.In this case,\n", + " possible feature selection results with `k_features=2`\n", + " are `[[1], [2]`, `[[1], [3, 4, 5]]`, or `[[2], [3, 4, 5]]`.\n", + " Feature groups can be useful for\n", " interpretability, for example, if features 3, 4, 5 are one-hot\n", - " encoded features. (for more details, please read the notes at the\n", - " bottom of this docstring). New in v 0.21.0.\n", + " encoded features. (For more details, please read the notes at the\n", + " bottom of this docstring). New in mlxtend v. 0.21.0.\n", "\n", "**Attributes**\n", "\n", @@ -1614,7 +1919,7 @@ " DataFrames are used in the `fit` method, the 'feature_names'\n", " correspond to the column names. Otherwise, the\n", " feature names are string representation of the feature\n", - " array indices. The 'feature_names' is new in v 0.13.0.\n", + " array indices. The 'feature_names' is new in v. 
0.13.0.\n", "\n", "**Notes**\n", "\n", @@ -1629,6 +1934,11 @@ " linear regression, the coefficient of the feature 2 and 3 can be different\n", " even if they are considered as one group in feature_groups.\n", "\n", + " (3) If both fixed_features and feature_groups are specified, ensure that each\n", + " feature group contains the fixed_features selection. E.g., for a 3-feature set\n", + " fixed_features=[0, 1] and feature_groups=[[0, 1], [2]] is valid;\n", + " fixed_features=[0, 1] and feature_groups=[[0], [1, 2]] is not valid.\n", + "\n", "**Examples**\n", "\n", "For usage examples, please see\n", diff --git a/docs/sources/user_guide/feature_selection/SequentialFeatureSelector.ipynb b/docs/sources/user_guide/feature_selection/SequentialFeatureSelector.ipynb index 3c8944735..0f5e28a8f 100644 --- a/docs/sources/user_guide/feature_selection/SequentialFeatureSelector.ipynb +++ b/docs/sources/user_guide/feature_selection/SequentialFeatureSelector.ipynb @@ -22,18 +22,26 @@ "text": [ "Author: Sebastian Raschka\n", "\n", - "Last updated: 2022-01-04\n", + "Last updated: 2022-09-06\n", "\n", "Python implementation: CPython\n", - "Python version : 3.9.6\n", - "IPython version : 7.30.1\n", + "Python version : 3.9.7\n", + "IPython version : 8.0.1\n", "\n", - "matplotlib: 3.5.1\n", - "numpy : 1.22.0\n", - "scipy : 1.7.3\n", - "mlxtend : 0.20.0.dev0\n", + "matplotlib: 3.5.2\n", + "numpy : 1.22.1\n", + "scipy : 1.9.1\n", + "mlxtend : 0.20.0\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sebastianraschka/miniforge3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n" + ] } ], "source": [ @@ -82,25 +90,71 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Sequential feature selection algorithms are a family of greedy search algorithms that are used to reduce an initial *d*-dimensional feature space to a *k*-dimensional feature subspace where *k < d*. The motivation behind feature selection algorithms is to automatically select a subset of features that is most relevant to the problem. The goal of feature selection is two-fold: We want to improve the computational efficiency and reduce the generalization error of the model by removing irrelevant features or noise. A wrapper approach such as sequential feature selection is especially useful if embedded feature selection -- for example, a regularization penalty like LASSO -- is not applicable.\n", + "Sequential feature selection algorithms are a family of greedy search algorithms that are used to reduce an initial *d*-dimensional feature space to a *k*-dimensional feature subspace where *k < d*. The motivation behind feature selection algorithms is to automatically select a subset of features most relevant to the problem. The goal of feature selection is two-fold: We want to improve the computational efficiency and reduce the model's generalization error by removing irrelevant features or noise. In addition, a wrapper approach such as sequential feature selection is advantageous if embedded feature selection -- for example, a regularization penalty like LASSO -- is not applicable.\n", "\n", - "In a nutshell, SFAs remove or add one feature at the time based on the classifier performance until a feature subset of the desired size *k* is reached. 
There are 4 different flavors of SFAs available via the `SequentialFeatureSelector`:\n", + "In a nutshell, SFAs remove or add one feature at a time based on the classifier performance until a feature subset of the desired size *k* is reached. There are four different flavors of SFAs available via the `SequentialFeatureSelector`:\n", "\n", "1. Sequential Forward Selection (SFS)\n", "2. Sequential Backward Selection (SBS)\n", "3. Sequential Forward Floating Selection (SFFS)\n", "4. Sequential Backward Floating Selection (SBFS)\n", "\n", - "The ***floating*** variants, SFFS and SBFS, can be considered as extensions to the simpler SFS and SBS algorithms. The floating algorithms have an additional exclusion or inclusion step to remove features once they were included (or excluded), so that a larger number of feature subset combinations can be sampled. It is important to emphasize that this step is conditional and only occurs if the resulting feature subset is assessed as \"better\" by the criterion function after removal (or addition) of a particular feature. Furthermore, I added an optional check to skip the conditional exclusion steps if the algorithm gets stuck in cycles. \n", + "The ***floating*** variants, SFFS and SBFS, can be considered extensions to the simpler SFS and SBS algorithms. The floating algorithms have an additional exclusion or inclusion step to remove features once they were included (or excluded) so that a larger number of feature subset combinations can be sampled. It is important to emphasize that this step is conditional and only occurs if the resulting feature subset is assessed as \"better\" by the criterion function after the removal (or addition) of a particular feature. Furthermore, I added an optional check to skip the conditional exclusion steps if the algorithm gets stuck in cycles. \n", "\n", "\n", "---\n", "\n", "How is this different from *Recursive Feature Elimination* (RFE) -- e.g., as implemented in `sklearn.feature_selection.RFE`? RFE is computationally less complex using the feature weight coefficients (e.g., linear models) or feature importance (tree-based algorithms) to eliminate features recursively, whereas SFSs eliminate (or add) features based on a user-defined classifier/regression performance metric.\n", "\n", - "---\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial Videos" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visual Illustration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A visual illustration of the sequential backward selection process is provided below, from the paper\n", + "\n", + "- Joe Bemister-Buffington, Alex J. Wolf, Sebastian Raschka, and Leslie A. Kuhn (2020)\n", + "Machine Learning to Identify Flexibility Signatures of Class A GPCR Inhibition\n", + "Biomolecules 2020, 10, 454. 
https://www.mdpi.com/2218-273X/10/3/454#\n", "\n", - "The SFAs are outlined in pseudo code below:" + "![](SequentialFeatureSelector_files/sbs-gpcr2020.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Algorithmic Details" ] }, { @@ -267,19 +321,6 @@ "**Termination:** stop when ***k*** equals the number of desired features\n" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A visual illustration of the sequential backward selection process is provided below, from the paper\n", - "\n", - "- Joe Bemister-Buffington, Alex J. Wolf, Sebastian Raschka, and Leslie A. Kuhn (2020)\n", - "Machine Learning to Identify Flexibility Signatures of Class A GPCR Inhibition\n", - "Biomolecules 2020, 10, 454. https://www.mdpi.com/2218-273X/10/3/454#\n", - "\n", - "![](SequentialFeatureSelector_files/sbs-gpcr2020.png)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -340,15 +381,15 @@ "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 4 out of 4 | elapsed: 0.0s finished\n", "\n", - "[2022-01-04 08:41:07] Features: 1/3 -- score: 0.96[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[2022-09-06 20:51:22] Features: 1/3 -- score: 0.96[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 0.0s finished\n", "\n", - "[2022-01-04 08:41:07] Features: 2/3 -- score: 0.9733333333333334[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[2022-09-06 20:51:22] Features: 2/3 -- score: 0.9733333333333334[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 2 out of 2 | elapsed: 0.0s finished\n", "\n", - "[2022-01-04 08:41:07] Features: 3/3 -- score: 0.9733333333333334" + "[2022-09-06 20:51:22] Features: 3/3 -- score: 0.9733333333333334" ] } ], @@ -424,15 +465,15 @@ "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 4 out of 4 | elapsed: 0.0s finished\n", "\n", - "[2022-01-04 08:41:07] Features: 1/3 -- score: 0.96[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[2022-09-06 20:51:22] Features: 1/3 -- score: 0.96[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 0.0s finished\n", "\n", - "[2022-01-04 08:41:07] Features: 2/3 -- score: 0.9733333333333334[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[2022-09-06 20:51:22] Features: 2/3 -- score: 0.9733333333333334[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 2 out of 2 | elapsed: 0.0s finished\n", "\n", - "[2022-01-04 08:41:07] Features: 3/3 -- score: 0.9733333333333334" + "[2022-09-06 20:51:22] Features: 3/3 -- score: 0.9733333333333334" ] }, { @@ -563,6 +604,26 @@ "execution_count": 10, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sebastianraschka/miniforge3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy 
version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n", + "/Users/sebastianraschka/miniforge3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n", + "/Users/sebastianraschka/miniforge3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n", + "/Users/sebastianraschka/miniforge3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n", + "/Users/sebastianraschka/miniforge3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n", + "/Users/sebastianraschka/miniforge3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n", + "/Users/sebastianraschka/miniforge3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -581,7 +642,23 @@ "Sequential Forward Floating Selection (k=3):\n", "(1, 2, 3)\n", "CV Score:\n", - "0.9731507823613088\n", + "0.9731507823613088\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sebastianraschka/miniforge3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n", + "/Users/sebastianraschka/miniforge3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "\n", "Sequential Backward Floating Selection (k=3):\n", "(1, 2, 3)\n", @@ -986,19 +1063,19 @@ "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 4 out of 4 | elapsed: 0.0s finished\n", "\n", - "[2022-01-04 08:41:08] Features: 1/4 -- score: 0.96[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[2022-09-06 20:51:24] Features: 1/4 -- score: 0.96[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 0.0s finished\n", "\n", - "[2022-01-04 08:41:08] Features: 2/4 -- score: 
0.9666666666666668[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[2022-09-06 20:51:24] Features: 2/4 -- score: 0.9666666666666668[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 2 out of 2 | elapsed: 0.0s finished\n", "\n", - "[2022-01-04 08:41:08] Features: 3/4 -- score: 0.9533333333333334[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[2022-09-06 20:51:24] Features: 3/4 -- score: 0.9533333333333334[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s finished\n", "\n", - "[2022-01-04 08:41:08] Features: 4/4 -- score: 0.9733333333333334" + "[2022-09-06 20:51:24] Features: 4/4 -- score: 0.9733333333333334" ] }, { @@ -1149,15 +1226,15 @@ "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 4 out of 4 | elapsed: 0.0s finished\n", "\n", - "[2022-01-04 08:41:09] Features: 1/3 -- score: 0.9666666666666667[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[2022-09-06 20:51:25] Features: 1/3 -- score: 0.9666666666666667[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 0.0s finished\n", "\n", - "[2022-01-04 08:41:09] Features: 2/3 -- score: 0.9666666666666667[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[2022-09-06 20:51:25] Features: 2/3 -- score: 0.9666666666666667[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 2 out of 2 | elapsed: 0.0s finished\n", "\n", - "[2022-01-04 08:41:09] Features: 3/3 -- score: 0.9666666666666667" + "[2022-09-06 20:51:25] Features: 3/3 -- score: 0.9666666666666667" ] } ], @@ -1283,6 +1360,13 @@ "## Example 8 -- Sequential Feature Selection and GridSearch" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the following example, we are tuning the SFS's estimator using GridSearch. To avoid unwanted behavior or side-effects, it's recommended to use the estimator inside and outside of SFS as separate instances." + ] + }, { "cell_type": "code", "execution_count": 22, @@ -1301,15 +1385,6 @@ " X, y, test_size=0.2, random_state=123)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using the same estimator inside and outside SFS\n", - "\n", - "The typical usecase, to avoid overfitting, is to use the same estimator inside and outside the feature selection object. 
The following code shows how to tune this estimator inside the `SFS`:" - ] - }, { "cell_type": "code", "execution_count": 23, @@ -1322,6 +1397,7 @@ "import mlxtend\n", "\n", "knn1 = KNeighborsClassifier()\n", + "knn2 = KNeighborsClassifier()\n", "\n", "sfs1 = SFS(estimator=knn1, \n", " k_features=3,\n", @@ -1331,11 +1407,12 @@ " cv=5)\n", "\n", "pipe = Pipeline([('sfs', sfs1), \n", - " ('knn1', knn1)])\n", + " ('knn2', knn2)])\n", "\n", "param_grid = {\n", " 'sfs__k_features': [1, 2, 3],\n", - " 'sfs__estimator__n_neighbors': [3, 4, 7]\n", + " 'sfs__estimator__n_neighbors': [3, 4, 7], # inner knn\n", + " 'knn2__n_neighbors': [3, 4, 7] # outer knn\n", " }\n", " \n", "gs = GridSearchCV(estimator=pipe, \n", @@ -1357,26 +1434,8 @@ ] }, { - "cell_type": "code", - "execution_count": 24, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'sfs__estimator__n_neighbors': 3, 'sfs__k_features': 1} test acc.: 0.925\n", - "{'sfs__estimator__n_neighbors': 3, 'sfs__k_features': 2} test acc.: 0.9166666666666667\n", - "{'sfs__estimator__n_neighbors': 3, 'sfs__k_features': 3} test acc.: 0.95\n", - "{'sfs__estimator__n_neighbors': 4, 'sfs__k_features': 1} test acc.: 0.925\n", - "{'sfs__estimator__n_neighbors': 4, 'sfs__k_features': 2} test acc.: 0.9166666666666667\n", - "{'sfs__estimator__n_neighbors': 4, 'sfs__k_features': 3} test acc.: 0.95\n", - "{'sfs__estimator__n_neighbors': 7, 'sfs__k_features': 1} test acc.: 0.925\n", - "{'sfs__estimator__n_neighbors': 7, 'sfs__k_features': 2} test acc.: 0.9166666666666667\n", - "{'sfs__estimator__n_neighbors': 7, 'sfs__k_features': 3} test acc.: 0.95\n" - ] - } - ], "source": [ "for i in range(len(gs.cv_results_['params'])):\n", " print(gs.cv_results_['params'][i], 'test acc.:', gs.cv_results_['mean_test_score'][i])" @@ -1391,116 +1450,7 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best parameters via GridSearch {'sfs__estimator__n_neighbors': 3, 'sfs__k_features': 3}\n" - ] - } - ], - "source": [ - "print(\"Best parameters via GridSearch\", gs.best_params_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "(Note that in case of a tie, scikit-learn usually uses the parameter combination with the lowest index -- the one that comes first.)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Please note that it is currently not recommended to work with \"`refit=True`\" or use `gs.best_estimator_`. Instead, to adopt the best model after grid search, it is recommended to carry out this step manually via the following code:" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Pipeline(steps=[('sfs',\n", - " SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3),\n", - " k_features=3, scoring='accuracy')),\n", - " ('knn1', KNeighborsClassifier(n_neighbors=3))])" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipe.set_params(**gs.best_params_).fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using a different estimator inside and outside SFS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For additional flexibility in the pipeline, you may want to use a different estimator inside and outside SFS. 
In the example below, this is shown via the distinct `knn1` and `knn2` estimator objects:" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import GridSearchCV\n", - "from sklearn.pipeline import Pipeline\n", - "from mlxtend.feature_selection import SequentialFeatureSelector as SFS\n", - "import mlxtend\n", - "\n", - "knn1 = KNeighborsClassifier()\n", - "knn2 = KNeighborsClassifier()\n", - "\n", - "sfs1 = SFS(estimator=knn1, \n", - " k_features=3,\n", - " forward=True, \n", - " floating=False, \n", - " scoring='accuracy',\n", - " cv=5)\n", - "\n", - "pipe = Pipeline([('sfs', sfs1), \n", - " ('knn2', knn2)])\n", - "\n", - "param_grid = {\n", - " 'sfs__k_features': [1, 2, 3],\n", - " 'sfs__estimator__n_neighbors': [3, 4, 7], # inner knn\n", - " 'knn2__n_neighbors': [3, 4, 7] # outer knn\n", - " }\n", - " \n", - "gs = GridSearchCV(estimator=pipe, \n", - " param_grid=param_grid, \n", - " scoring='accuracy', \n", - " n_jobs=1, \n", - " cv=5,\n", - " refit=False)\n", - "\n", - "# run gridearch\n", - "gs = gs.fit(X_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1517,7 +1467,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1529,7 +1479,7 @@ " ('knn2', KNeighborsClassifier(n_neighbors=7))])" ] }, - "execution_count": 29, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1555,7 +1505,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1564,7 +1514,7 @@ "(150, 4)" ] }, - "execution_count": 30, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1575,7 +1525,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1656,7 +1606,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1694,16 +1644,16 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 33, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1722,7 +1672,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -1731,7 +1681,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1757,52 +1707,201 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ - "## Example 11 - Using Pandas DataFrames" + "## Example 11 - Interrupting Long Runs for Intermediate Results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Optionally, we can also use pandas DataFrames and pandas Series as input to the `fit` function. In this case, the column names of the pandas DataFrame will be used as feature names. However, note that if `custom_feature_names` are provided in the fit function, these `custom_feature_names` take precedence over the DataFrame column-based feature names." + "If your run is taking too long, it is possible to trigger a `KeyboardInterrupt` (e.g., ctrl+c on a Mac, or interrupting the cell in a Jupyter notebook) to obtain temporary results." 
] }, { - "cell_type": "code", - "execution_count": 36, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import pandas as pd\n", - "from sklearn.neighbors import KNeighborsClassifier\n", - "from sklearn.datasets import load_iris\n", - "from mlxtend.feature_selection import SequentialFeatureSelector as SFS\n", - "\n", - "\n", - "iris = load_iris()\n", - "X = iris.data\n", - "y = iris.target\n", - "knn = KNeighborsClassifier(n_neighbors=4)\n", - "\n", - "sfs1 = SFS(knn, \n", - " k_features=3, \n", - " forward=True, \n", - " floating=False, \n", - " scoring='accuracy',\n", - " cv=0)" + "**Toy dataset**" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 1, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sebastianraschka/miniforge3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.3\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n" + ] + } + ], + "source": [ + "from sklearn.datasets import make_classification\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "\n", + "X, y = make_classification(\n", + " n_samples=20000,\n", + " n_features=500,\n", + " n_informative=10,\n", + " n_redundant=40,\n", + " n_repeated=25,\n", + " n_clusters_per_class=5,\n", + " flip_y=0.05,\n", + " class_sep=0.5,\n", + " random_state=123,\n", + ")\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=123\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Long run with interruption**" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", + "[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 7.8s finished\n", + "\n", + "[2022-09-13 21:10:39] Features: 1/10 -- score: 0.5965[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.2s remaining: 0.0s\n", + "[Parallel(n_jobs=1)]: Done 499 out of 499 | elapsed: 25.5s finished\n", + "\n", + "[2022-09-13 21:11:04] Features: 2/10 -- score: 0.6256875000000001[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.1s remaining: 0.0s\n", + "\n", + "STOPPING EARLY DUE TO KEYBOARD INTERRUPT..." + ] + } + ], + "source": [ + "from mlxtend.feature_selection import SequentialFeatureSelector as SFS\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "model = LogisticRegression()\n", + "\n", + "sfs1 = SFS(model, \n", + " k_features=10, \n", + " forward=True, \n", + " floating=False, \n", + " verbose=2,\n", + " scoring='accuracy',\n", + " cv=5)\n", + "\n", + "sfs1 = sfs1.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Finalizing the fit**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the feature selection run hasn't finished, so certain attributes may not be available. In order to use the SFS instance, it is recommended to call `finalize_fit`, which will make SFS estimator appear as \"fitted\" process the temporary results:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "sfs1.finalize_fit()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(128, 160)\n", + "0.6256875000000001\n" + ] + } + ], + "source": [ + "print(sfs1.k_feature_idx_)\n", + "print(sfs1.k_score_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 12 - Using Pandas DataFrames" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optionally, we can also use pandas DataFrames and pandas Series as input to the `fit` function. 
In this case, the column names of the pandas DataFrame will be used as feature names. However, note that if `custom_feature_names` are provided in the fit function, these `custom_feature_names` take precedence over the DataFrame column-based feature names." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.datasets import load_iris\n", + "from mlxtend.feature_selection import SequentialFeatureSelector as SFS\n", + "\n", + "\n", + "iris = load_iris()\n", + "X = iris.data\n", + "y = iris.target\n", + "knn = KNeighborsClassifier(n_neighbors=4)\n", + "\n", + "sfs1 = SFS(knn, \n", + " k_features=3, \n", + " forward=True, \n", + " floating=False, \n", + " scoring='accuracy',\n", + " cv=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal lenpetal lensepal widpetal wid
05.13.51.40.2
14.93.01.40.2
24.73.21.30.2
34.63.11.50.2
45.03.61.40.2
\n", + "
" + ], + "text/plain": [ + " sepal len petal len sepal wid petal wid\n", + "0 5.1 3.5 1.4 0.2\n", + "1 4.9 3.0 1.4 0.2\n", + "2 4.7 3.2 1.3 0.2\n", + "3 4.6 3.1 1.5 0.2\n", + "4 5.0 3.6 1.4 0.2" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.datasets import load_iris\n", + "import pandas as pd\n", + "\n", + "iris = load_iris()\n", + "X = iris.data\n", + "y = iris.target\n", + "\n", + "X_df = pd.DataFrame(X, columns=['sepal len', 'petal len',\n", + " 'sepal wid', 'petal wid'])\n", + "X_df.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier\n", + "from mlxtend.feature_selection import SequentialFeatureSelector as SFS\n", + "\n", + "knn = KNeighborsClassifier(n_neighbors=3)\n", + "\n", + "sfs1 = SFS(knn, \n", + " k_features=2, \n", + " scoring='accuracy',\n", + " feature_groups=(['sepal len', 'sepal wid'], ['petal len'], ['petal wid']),\n", + " cv=3)\n", + "\n", + "sfs1 = sfs1.fit(X_df, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "sfs1 = SFS(knn, \n", + " k_features=2, \n", + " scoring='accuracy',\n", + " feature_groups=[[0, 2], [1], [3]],\n", + " cv=3)\n", + "\n", + "sfs1 = sfs1.fit(X, y)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2248,7 +2504,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -2257,7 +2513,7 @@ "text": [ "## SequentialFeatureSelector\n", "\n", - "*SequentialFeatureSelector(estimator, k_features=1, forward=True, floating=False, verbose=0, scoring=None, cv=5, n_jobs=1, pre_dispatch='2*n_jobs', clone_estimator=True, fixed_features=None)*\n", + "*SequentialFeatureSelector(estimator, k_features=1, forward=True, floating=False, verbose=0, scoring=None, cv=5, n_jobs=1, pre_dispatch='2*n_jobs', clone_estimator=True, fixed_features=None, feature_groups=None)*\n", "\n", "Sequential Feature Selection for Classification and Regression.\n", "\n", @@ -2282,21 +2538,25 @@ " feature subset that is within one standard error of the\n", " cross-validation performance will be selected.\n", "\n", + "\n", "- `forward` : bool (default: True)\n", "\n", " Forward selection if True,\n", " backward selection otherwise\n", "\n", + "\n", "- `floating` : bool (default: False)\n", "\n", " Adds a conditional exclusion/inclusion if True.\n", "\n", + "\n", "- `verbose` : int (default: 0), level of verbosity to use in logging.\n", "\n", " If 0, no output,\n", " if 1 number of features in current set, if 2 detailed logging i\n", " ncluding timestamp and cv scores at step.\n", "\n", + "\n", "- `scoring` : str, callable, or None (default: None)\n", "\n", " If None (default), uses 'accuracy' for sklearn classifiers\n", @@ -2310,6 +2570,7 @@ " http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html\n", " for more information.\n", "\n", + "\n", "- `cv` : int (default: 5)\n", "\n", " Integer or iterable yielding train, test splits. If cv is an integer\n", @@ -2317,11 +2578,13 @@ " labels) stratified k-fold. Otherwise regular k-fold cross-validation\n", " is performed. No cross-validation if cv is None, False, or 0.\n", "\n", + "\n", "- `n_jobs` : int (default: 1)\n", "\n", " The number of CPUs to use for evaluating different feature subsets\n", " in parallel. 
-1 means 'all CPUs'.\n", "\n", + "\n", "- `pre_dispatch` : int, or string (default: '2*n_jobs')\n", "\n", " Controls the number of jobs that get dispatched\n", @@ -2336,6 +2599,7 @@ " A string, giving an expression as a function\n", " of n_jobs, as in `2*n_jobs`\n", "\n", + "\n", "- `clone_estimator` : bool (default: True)\n", "\n", " Clones estimator if True; works with the original estimator instance\n", @@ -2343,6 +2607,7 @@ " implement scikit-learn's set_params and get_params methods.\n", " In addition, it is required to set cv=0, and n_jobs=1.\n", "\n", + "\n", "- `fixed_features` : tuple (default: None)\n", "\n", " If not `None`, the feature indices provided as a tuple will be\n", @@ -2354,12 +2619,28 @@ " In other words, ensure that `k_features > len(fixed_features)`.\n", " New in mlxtend v. 0.18.0.\n", "\n", + "\n", + "- `feature_groups` : list or None (default: None)\n", + "\n", + " Optional argument for treating certain features as a group.\n", + " This means, the features within a group are always selected together,\n", + " never split.\n", + " For example, `feature_groups=[[1], [2], [3, 4, 5]]`\n", + " specifies 3 feature groups.In this case,\n", + " possible feature selection results with `k_features=2`\n", + " are `[[1], [2]`, `[[1], [3, 4, 5]]`, or `[[2], [3, 4, 5]]`.\n", + " Feature groups can be useful for\n", + " interpretability, for example, if features 3, 4, 5 are one-hot\n", + " encoded features. (For more details, please read the notes at the\n", + " bottom of this docstring). New in mlxtend v. 0.21.0.\n", + "\n", "**Attributes**\n", "\n", "- `k_feature_idx_` : array-like, shape = [n_predictions]\n", "\n", " Feature Indices of the selected feature subsets.\n", "\n", + "\n", "- `k_feature_names_` : array-like, shape = [n_predictions]\n", "\n", " Feature names of the selected feature subsets. If pandas\n", @@ -2368,15 +2649,19 @@ " feature names are string representation of the feature\n", " array indices. New in v 0.13.0.\n", "\n", + "\n", "- `k_score_` : float\n", "\n", " Cross validation average score of the selected subset.\n", "\n", + "\n", "- `subsets_` : dict\n", "\n", " A dictionary of selected feature subsets during the\n", " sequential selection, where the dictionary keys are\n", - " the lengths k of these feature subsets. The dictionary\n", + " the lengths k of these feature subsets. If the parameter\n", + " `feature_groups` is not None, the value of key indicates\n", + " the number of groups that are selected together. The dictionary\n", " values are dictionaries themselves with the following\n", " keys: 'feature_idx' (tuple of indices of the feature subset)\n", " 'feature_names' (tuple of feature names of the feat. subset)\n", @@ -2388,6 +2673,24 @@ " feature names are string representation of the feature\n", " array indices. The 'feature_names' is new in v 0.13.0.\n", "\n", + "**Notes**\n", + "\n", + "(1) If parameter `feature_groups` is not None, the\n", + " number of features is equal to the number of feature groups, i.e.\n", + " `len(feature_groups)`. For example, if `feature_groups = [[0], [1], [2, 3],\n", + " [4]]`, then the `max_features` value cannot exceed 4.\n", + "\n", + " (2) Although two or more individual features may be considered as one group\n", + " throughout the feature-selection process, it does not mean the individual\n", + " features of that group have the same impact on the outcome. 
For instance, in\n", + " linear regression, the coefficient of the feature 2 and 3 can be different\n", + " even if they are considered as one group in feature_groups.\n", + "\n", + " (3) If both fixed_features and feature_groups are specified, ensure that each\n", + " feature group contains the fixed_features selection. E.g., for a 3-feature set\n", + " fixed_features=[0, 1] and feature_groups=[[0, 1], [2]] is valid;\n", + " fixed_features=[0, 1] and feature_groups=[[0], [1, 2]] is not valid.\n", + "\n", "**Examples**\n", "\n", "For usage examples, please see\n", @@ -2566,6 +2869,13 @@ " s = f.read()\n", "print(s)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/sources/user_guide/feature_selection/SequentialFeatureSelector_files/feature_groups.jpeg b/docs/sources/user_guide/feature_selection/SequentialFeatureSelector_files/feature_groups.jpeg new file mode 100644 index 000000000..c14fd4e79 Binary files /dev/null and b/docs/sources/user_guide/feature_selection/SequentialFeatureSelector_files/feature_groups.jpeg differ diff --git a/docs/sources/user_guide/feature_selection/SequentialFeatureSelector_files/feature_groups.key b/docs/sources/user_guide/feature_selection/SequentialFeatureSelector_files/feature_groups.key new file mode 100755 index 000000000..22ff52fc7 Binary files /dev/null and b/docs/sources/user_guide/feature_selection/SequentialFeatureSelector_files/feature_groups.key differ diff --git a/mlxtend/feature_selection/exhaustive_feature_selector.py b/mlxtend/feature_selection/exhaustive_feature_selector.py index 2869f8a59..300f14584 100644 --- a/mlxtend/feature_selection/exhaustive_feature_selector.py +++ b/mlxtend/feature_selection/exhaustive_feature_selector.py @@ -9,6 +9,7 @@ import operator as op import sys +import types from copy import deepcopy from functools import reduce from itertools import chain, combinations @@ -19,84 +20,9 @@ from joblib import Parallel, delayed from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone from sklearn.metrics import get_scorer -from sklearn.model_selection import cross_val_score from ..externals.name_estimators import _name_estimators - - -def _merge_lists(nested_list, high_level_indices=None): - """ - merge elements of lists (of a nested_list) into one single tuple with elements - sorted in ascending order. - - Parameters - ---------- - nested_list: List - a list whose elements must be list as well. - - high_level_indices: list or tuple, default None - a list or tuple that contains integers that are between 0 (inclusive) and - the length of `nested_lst` (exclusive). If None, the merge of all - lists nested in `nested_list` will be returned. 
- - Returns - ------- - out: tuple - a tuple, with elements sorted in ascending order, that is the merge of inner - lists whose indices are provided in `high_level_indices` - - Example: - nested_list = [[1],[2, 3],[4]] - high_level_indices = [1, 2] - >>> _merge_lists(nested_list, high_level_indices) - (2, 3, 4) # merging [2, 3] and [4] - """ - if high_level_indices is None: - high_level_indices = list(range(len(nested_list))) - - out = [] - for idx in high_level_indices: - out.extend(nested_list[idx]) - - return tuple(sorted(out)) - - -def _calc_score(selector, X, y, indices, groups=None, **fit_params): - if selector.cv: - scores = cross_val_score( - selector.est_, - X[:, indices], - y, - groups=groups, - cv=selector.cv, - scoring=selector.scorer, - n_jobs=1, - pre_dispatch=selector.pre_dispatch, - fit_params=fit_params, - ) - else: - selector.est_.fit(X[:, indices], y, **fit_params) - scores = np.array([selector.scorer(selector.est_, X[:, indices], y)]) - return indices, scores - - -def _get_featurenames(subsets_dict, feature_idx, X): - feature_names = None - if feature_idx is not None: - if hasattr(X, "loc"): - feature_names = tuple((X.columns[i] for i in feature_idx)) - else: - feature_names = tuple(str(i) for i in feature_idx) - - subsets_dict_ = deepcopy(subsets_dict) - for key in subsets_dict_: - if hasattr(X, "loc"): - new_tuple = tuple((X.columns[i] for i in subsets_dict[key]["feature_idx"])) - else: - new_tuple = tuple(str(i) for i in subsets_dict[key]["feature_idx"]) - subsets_dict_[key]["feature_names"] = new_tuple - - return subsets_dict_, feature_names +from .utilities import _calc_score, _get_featurenames, _merge_lists, _preprocess class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin): @@ -170,10 +96,16 @@ class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin): feature_groups : list or None (default: None) Optional argument for treating certain features as a group. - For example `[[1], [2], [3, 4, 5]]`, which can be useful for + This means, the features within a group are always selected together, + never split. + For example, `feature_groups=[[1], [2], [3, 4, 5]]` + specifies 3 feature groups.In this case, + possible feature selection results with `k_features=2` + are `[[1], [2]`, `[[1], [3, 4, 5]]`, or `[[2], [3, 4, 5]]`. + Feature groups can be useful for interpretability, for example, if features 3, 4, 5 are one-hot - encoded features. (for more details, please read the notes at the - bottom of this docstring). New in v 0.21.0. + encoded features. (For more details, please read the notes at the + bottom of this docstring). New in mlxtend v. 0.21.0. Attributes ---------- @@ -203,7 +135,7 @@ class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin): DataFrames are used in the `fit` method, the 'feature_names' correspond to the column names. Otherwise, the feature names are string representation of the feature - array indices. The 'feature_names' is new in v 0.13.0. + array indices. The 'feature_names' is new in v. 0.13.0. Notes ----- @@ -223,6 +155,10 @@ class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin): fixed_features=[0, 1] and feature_groups=[[0, 1], [2]] is valid; fixed_features=[0, 1] and feature_groups=[[0], [1, 2]] is not valid. + (4) In case of KeyboardInterrupt, the dictionary subsets may not be completed. + If user is still interested in getting the best score, they can use method + `finalize_fit`. 
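
As a minimal sketch of the behavior described in the notes above — feature groups are selected or dropped together, fixed features must cover whole groups, and `finalize_fit()` can recover the best subset after a `KeyboardInterrupt` — the snippet below uses the parameter and attribute names introduced in this diff (`feature_groups`, `fixed_features`, `interrupted_`, `finalize_fit`); the estimator, group layout, and feature counts are illustrative assumptions, not taken from this change set:

```
# Illustrative only: exercises the feature_groups / fixed_features / finalize_fit
# behavior documented in the ExhaustiveFeatureSelector docstring above.
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

X, y = load_iris(return_X_y=True)

efs = EFS(
    KNeighborsClassifier(n_neighbors=4),
    min_features=2,                      # counted in groups when feature_groups is set
    max_features=3,
    scoring="accuracy",
    cv=5,
    feature_groups=[[0, 1], [2], [3]],   # features 0 and 1 are never split
    fixed_features=[0, 1],               # valid: covers the whole group [0, 1]
)

efs = efs.fit(X, y)   # pressing Ctrl-C stops early; fit() records the interrupt

if efs.interrupted_:
    # subsets_ may be incomplete; compute the best subset from what was scored so far
    efs.finalize_fit()

print(efs.best_idx_, efs.best_score_)
```
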
+ Examples ----------- For usage examples, please see @@ -248,20 +184,51 @@ def __init__( self.min_features = min_features self.max_features = max_features self.pre_dispatch = pre_dispatch - self.scoring = scoring - self.scorer = get_scorer(scoring) + # Want to raise meaningful error message if a + # cross-validation generator is inputted + if isinstance(cv, types.GeneratorType): + err_msg = ( + "Input cv is a generator object, which is not " + "supported. Instead please input an iterable yielding " + "train, test splits. This can usually be done by " + "passing a cross-validation generator to the " + "built-in list function. I.e. cv=list()" + ) + raise TypeError(err_msg) + self.cv = cv - self.print_progress = print_progress self.n_jobs = n_jobs - self.named_est = { - key: value for key, value in _name_estimators([self.estimator]) - } + self.print_progress = print_progress + self.clone_estimator = clone_estimator if self.clone_estimator: self.est_ = clone(self.estimator) else: self.est_ = self.estimator + self.scoring = scoring + if self.scoring is None: + if not hasattr(self.est_, "_estimator_type"): + raise AttributeError( + "Estimator must have an ._estimator_type for infering `scoring`" + ) + + if self.est_._estimator_type == "classifier": + self.scoring = "accuracy" + elif self.est_._estimator_type == "regressor": + self.scoring = "r2" + else: + raise AttributeError("Estimator must be a Classifier or Regressor.") + + if isinstance(self.scoring, str): + self.scorer = get_scorer(self.scoring) + else: + self.scorer = self.scoring + + self.named_est = { + key: value for key, value in _name_estimators([self.estimator]) + } + self.fixed_features = fixed_features self.feature_groups = feature_groups @@ -301,16 +268,12 @@ def fit(self, X, y, groups=None, **fit_params): self.subsets_ = {} self.fitted = False self.interrupted_ = False - self.feature_names = None self.best_idx_ = None self.best_feature_names_ = None self.best_score_ = None - if hasattr(X, "loc"): - X_ = X.values - self.feature_names = list(X.columns) - else: - X_ = X + X_, self.feature_names = _preprocess(X) + self.n_features = X_.shape[1] self.feature_names_to_idx_mapper = None if self.feature_names is not None: @@ -318,34 +281,35 @@ def fit(self, X, y, groups=None, **fit_params): name: idx for idx, name in enumerate(self.feature_names) } - if self.fixed_features is None: - self.fixed_features = tuple() + self.fixed_features_ = self.fixed_features + if self.fixed_features_ is None: + self.fixed_features_ = tuple() - fixed_feature_types = {type(i) for i in self.fixed_features} + fixed_feature_types = {type(i) for i in self.fixed_features_} if len(fixed_feature_types) > 1: raise ValueError( f"fixed_features values must have the same type. Found {fixed_feature_types}." ) - if len(self.fixed_features) > 0 and isinstance(self.fixed_features[0], str): + if len(self.fixed_features_) > 0 and isinstance(self.fixed_features_[0], str): if self.feature_names_to_idx_mapper is None: raise ValueError( "The input X does not contain name of features provived in" " `fixed_features`. Try passing input X as pandas DataFrames." 
) - self.fixed_features = tuple( - self.feature_names_to_idx_mapper[name] for name in self.fixed_features + self.fixed_features_ = tuple( + self.feature_names_to_idx_mapper[name] for name in self.fixed_features_ ) - if not set(self.fixed_features).issubset(set(range(X_.shape[1]))): + if not set(self.fixed_features_).issubset(set(range(self.n_features))): raise ValueError( "`fixed_features` contains at least one feature that is not in the" " input data `X`." ) if self.feature_groups is None: - self.feature_groups = [[i] for i in range(X_.shape[1])] + self.feature_groups = [[i] for i in range(self.n_features)] for fg in self.feature_groups: if len(fg) == 0: @@ -375,10 +339,10 @@ def fit(self, X, y, groups=None, **fit_params): tmp = [self.feature_names_to_idx_mapper[name] for name in item] lst.append(tmp) - self.feature_groups[:] = lst + self.feature_groups = lst if sorted(_merge_lists(self.feature_groups)) != sorted( - list(range(X_.shape[1])) + list(range(self.n_features)) ): raise ValueError( "`feature_group` must contain all features within `range(X.shape[1])`" @@ -389,18 +353,18 @@ def fit(self, X, y, groups=None, **fit_params): # label-encoding fixed_features according to the groups in `feature_groups` # and replace each individual feature in `fixed_features` with their correspondig # group id - features_encoded_by_groupID = np.full(X_.shape[1], -1, dtype=np.int64) + features_encoded_by_groupID = np.full(self.n_features, -1, dtype=np.int64) for id, group in enumerate(self.feature_groups): for idx in group: features_encoded_by_groupID[idx] = id - lst = [features_encoded_by_groupID[idx] for idx in self.fixed_features] + lst = [features_encoded_by_groupID[idx] for idx in self.fixed_features_] self.fixed_features_group_set = set(lst) n_fixed_features_expected = sum( len(self.feature_groups[id]) for id in self.fixed_features_group_set ) - if n_fixed_features_expected != len(self.fixed_features): + if n_fixed_features_expected != len(self.fixed_features_): raise ValueError( "For each feature specified in the `fixed feature`, its group-mates" "must be specified as `fix_features` as well when `feature_groups`" @@ -482,11 +446,9 @@ def ncr(n, r): self, X_, y, - _merge_lists( - self.feature_groups, - list(set(c).union(self.fixed_features_group_set)), - ), + list(set(c).union(self.fixed_features_group_set)), groups=groups, + feature_groups=self.feature_groups, **fit_params, ) for c in candidates @@ -494,9 +456,9 @@ def ncr(n, r): ) try: - for iteration, (c, cv_scores) in work: + for iteration, (indices, cv_scores) in work: self.subsets_[iteration] = { - "feature_idx": c, + "feature_idx": _merge_lists(self.feature_groups, indices), "cv_scores": cv_scores, "avg_score": np.mean(cv_scores), } @@ -505,31 +467,36 @@ def ncr(n, r): sys.stderr.write("\rFeatures: %d/%d" % (iteration + 1, all_comb)) sys.stderr.flush() - if self._TESTING_INTERRUPT_MODE: - self.subsets_, self.best_feature_names_ = _get_featurenames( - self.subsets_, self.best_idx_, X - ) + if self._TESTING_INTERRUPT_MODE: # this is just for testing + self.finalize_fit() raise KeyboardInterrupt except KeyboardInterrupt: self.interrupted_ = True sys.stderr.write("\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...") - max_score = float("-inf") + if self.interrupted_: + self.fitted = False + else: + self.fitted = True # the completion of sequential selection process. 
+ self.finalize_fit() + + return self + + def finalize_fit(self): + max_score = np.NINF for c in self.subsets_: if self.subsets_[c]["avg_score"] > max_score: - max_score = self.subsets_[c]["avg_score"] best_subset = c - score = max_score - idx = self.subsets_[best_subset]["feature_idx"] + max_score = self.subsets_[c]["avg_score"] - self.best_idx_ = idx - self.best_score_ = score - self.fitted = True + self.best_idx_ = self.subsets_[best_subset]["feature_idx"] + self.best_score_ = max_score self.subsets_, self.best_feature_names_ = _get_featurenames( - self.subsets_, self.best_idx_, X + self.subsets_, self.best_idx_, self.feature_names, self.n_features ) - return self + + return def transform(self, X): """Return the best selected features from X. @@ -548,10 +515,7 @@ def transform(self, X): """ self._check_fitted() - if hasattr(X, "loc"): - X_ = X.values - else: - X_ = X + X_, _ = _preprocess(X) return X_[:, self.best_idx_] def fit_transform(self, X, y, groups=None, **fit_params): diff --git a/mlxtend/feature_selection/sequential_feature_selector.py b/mlxtend/feature_selection/sequential_feature_selector.py index e8a64a770..304206dec 100644 --- a/mlxtend/feature_selection/sequential_feature_selector.py +++ b/mlxtend/feature_selection/sequential_feature_selector.py @@ -18,54 +18,10 @@ from joblib import Parallel, delayed from sklearn.base import MetaEstimatorMixin, clone from sklearn.metrics import get_scorer -from sklearn.model_selection import cross_val_score from ..externals.name_estimators import _name_estimators from ..utils.base_compostion import _BaseXComposition - - -def _calc_score(selector, X, y, indices, groups=None, **fit_params): - if selector.cv: - scores = cross_val_score( - selector.est_, - X, - y, - groups=groups, - cv=selector.cv, - scoring=selector.scorer, - n_jobs=1, - pre_dispatch=selector.pre_dispatch, - fit_params=fit_params, - ) - else: - selector.est_.fit(X, y, **fit_params) - scores = np.array([selector.scorer(selector.est_, X, y)]) - return indices, scores - - -def _get_featurenames(subsets_dict, feature_idx, custom_feature_names, X): - feature_names = None - if feature_idx is not None: - if custom_feature_names is not None: - feature_names = tuple((custom_feature_names[i] for i in feature_idx)) - elif hasattr(X, "loc"): - feature_names = tuple((X.columns[i] for i in feature_idx)) - else: - feature_names = tuple(str(i) for i in feature_idx) - - subsets_dict_ = deepcopy(subsets_dict) - for key in subsets_dict_: - if custom_feature_names is not None: - new_tuple = tuple( - (custom_feature_names[i] for i in subsets_dict[key]["feature_idx"]) - ) - elif hasattr(X, "loc"): - new_tuple = tuple((X.columns[i] for i in subsets_dict[key]["feature_idx"])) - else: - new_tuple = tuple(str(i) for i in subsets_dict[key]["feature_idx"]) - subsets_dict_[key]["feature_names"] = new_tuple - - return subsets_dict_, feature_names +from .utilities import _calc_score, _get_featurenames, _merge_lists, _preprocess class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin): @@ -89,15 +45,19 @@ class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin): If "parsimonious" is provided as an argument, the smallest feature subset that is within one standard error of the cross-validation performance will be selected. + forward : bool (default: True) Forward selection if True, backward selection otherwise + floating : bool (default: False) Adds a conditional exclusion/inclusion if True. + verbose : int (default: 0), level of verbosity to use in logging. 
If 0, no output, if 1 number of features in current set, if 2 detailed logging i ncluding timestamp and cv scores at step. + scoring : str, callable, or None (default: None) If None (default), uses 'accuracy' for sklearn classifiers and 'r2' for sklearn regressors. @@ -109,14 +69,17 @@ class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin): sklearn's signature ``scorer(estimator, X, y)``; see http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html for more information. + cv : int (default: 5) Integer or iterable yielding train, test splits. If cv is an integer and `estimator` is a classifier (or y consists of integer class labels) stratified k-fold. Otherwise regular k-fold cross-validation is performed. No cross-validation if cv is None, False, or 0. + n_jobs : int (default: 1) The number of CPUs to use for evaluating different feature subsets in parallel. -1 means 'all CPUs'. + pre_dispatch : int, or string (default: '2*n_jobs') Controls the number of jobs that get dispatched during parallel execution if `n_jobs > 1` or `n_jobs=-1`. @@ -129,11 +92,13 @@ class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin): An int, giving the exact number of total jobs that are spawned A string, giving an expression as a function of n_jobs, as in `2*n_jobs` + clone_estimator : bool (default: True) Clones estimator if True; works with the original estimator instance if False. Set to False if the estimator doesn't implement scikit-learn's set_params and get_params methods. In addition, it is required to set cv=0, and n_jobs=1. + fixed_features : tuple (default: None) If not `None`, the feature indices provided as a tuple will be regarded as fixed by the feature selector. For example, if @@ -144,22 +109,40 @@ class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin): In other words, ensure that `k_features > len(fixed_features)`. New in mlxtend v. 0.18.0. + feature_groups : list or None (default: None) + Optional argument for treating certain features as a group. + This means, the features within a group are always selected together, + never split. + For example, `feature_groups=[[1], [2], [3, 4, 5]]` + specifies 3 feature groups. In this case, + possible feature selection results with `k_features=2` + are `[[1], [2]`, `[[1], [3, 4, 5]]`, or `[[2], [3, 4, 5]]`. + Feature groups can be useful for + interpretability, for example, if features 3, 4, 5 are one-hot + encoded features. (For more details, please read the notes at the + bottom of this docstring). New in mlxtend v. 0.21.0. + Attributes ---------- k_feature_idx_ : array-like, shape = [n_predictions] Feature Indices of the selected feature subsets. + k_feature_names_ : array-like, shape = [n_predictions] Feature names of the selected feature subsets. If pandas DataFrames are used in the `fit` method, the feature names correspond to the column names. Otherwise, the feature names are string representation of the feature array indices. New in v 0.13.0. + k_score_ : float Cross validation average score of the selected subset. + subsets_ : dict A dictionary of selected feature subsets during the sequential selection, where the dictionary keys are - the lengths k of these feature subsets. The dictionary + the lengths k of these feature subsets. If the parameter + `feature_groups` is not None, the value of key indicates + the number of groups that are selected together. 
The dictionary values are dictionaries themselves with the following keys: 'feature_idx' (tuple of indices of the feature subset) 'feature_names' (tuple of feature names of the feat. subset) @@ -171,6 +154,28 @@ class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin): feature names are string representation of the feature array indices. The 'feature_names' is new in v 0.13.0. + Notes + ----- + (1) If parameter `feature_groups` is not None, the + number of features is equal to the number of feature groups, i.e. + `len(feature_groups)`. For example, if `feature_groups = [[0], [1], [2, 3], + [4]]`, then the `max_features` value cannot exceed 4. + + (2) Although two or more individual features may be considered as one group + throughout the feature-selection process, it does not mean the individual + features of that group have the same impact on the outcome. For instance, in + linear regression, the coefficient of the feature 2 and 3 can be different + even if they are considered as one group in feature_groups. + + (3) If both fixed_features and feature_groups are specified, ensure that each + feature group contains the fixed_features selection. E.g., for a 3-feature set + fixed_features=[0, 1] and feature_groups=[[0, 1], [2]] is valid; + fixed_features=[0, 1] and feature_groups=[[0], [1, 2]] is not valid. + + (4) In case of KeyboardInterrupt, the dictionary subsets may not be completed. + If user is still interested in getting the best score, they can use method + `finalize_fit`. + Examples ----------- For usage examples, please see @@ -191,6 +196,7 @@ def __init__( pre_dispatch="2*n_jobs", clone_estimator=True, fixed_features=None, + feature_groups=None, ): self.estimator = estimator @@ -212,54 +218,36 @@ def __init__( self.cv = cv self.n_jobs = n_jobs self.verbose = verbose - self.clone_estimator = clone_estimator - - if fixed_features is not None: - if isinstance(self.k_features, int) and self.k_features <= len( - fixed_features - ): - raise ValueError( - "Number of features to be selected must" - " be larger than the number of" - " features specified via `fixed_features`." - " Got `k_features=%d` and" - " `fixed_features=%d`" % (k_features, len(fixed_features)) - ) - - elif isinstance(self.k_features, tuple) and self.k_features[0] <= len( - fixed_features - ): - raise ValueError( - "The minimum number of features to" - " be selected must" - " be larger than the number of" - " features specified via `fixed_features`." 
- " Got `k_features=%s` and " - "`len(fixed_features)=%d`" % (k_features, len(fixed_features)) - ) - - self.fixed_features = fixed_features + self.clone_estimator = clone_estimator if self.clone_estimator: self.est_ = clone(self.estimator) else: self.est_ = self.estimator + self.scoring = scoring + if self.scoring is None: + if not hasattr(self.est_, "_estimator_type"): + raise AttributeError( + "Estimator must have an ._estimator_type for infering `scoring`" + ) - if scoring is None: if self.est_._estimator_type == "classifier": - scoring = "accuracy" + self.scoring = "accuracy" elif self.est_._estimator_type == "regressor": - scoring = "r2" + self.scoring = "r2" else: - raise AttributeError("Estimator must " "be a Classifier or Regressor.") - if isinstance(scoring, str): - self.scorer = get_scorer(scoring) + raise AttributeError("Estimator must be a Classifier or Regressor.") + + if isinstance(self.scoring, str): + self.scorer = get_scorer(self.scoring) else: - self.scorer = scoring + self.scorer = self.scoring + + self.fixed_features = fixed_features + self.feature_groups = feature_groups self.fitted = False - self.subsets_ = {} self.interrupted_ = False # don't mess with this unless testing @@ -291,7 +279,31 @@ def set_params(self, **params): self._set_params("estimator", "named_estimators", **params) return self - def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params): + def generate_error_message_k_features(self, name): + if ( + len(self.fixed_features_) == 0 + and len(self.feature_groups_) == self.n_features + ): + err_msg = f"{name} must be between 1 and X.shape[1]." + + elif ( + len(self.fixed_features_) > 0 + and len(self.feature_groups_) == self.n_features + ): + err_msg = f"{name} must be between len(fixed_features) and X.shape[1]." + + elif ( + len(self.fixed_features_) == 0 + and len(self.feature_groups_) < self.n_features + ): + err_msg = f"{name} must be between 1 and len(feature_groups)." + + else: # both fixed_features and feature_groups are provided + err_msg = f"{name} must be between the number of groups that appear in fixed_features and len(feature_groups)." + + return err_msg + + def fit(self, X, y, groups=None, **fit_params): """Perform feature selection and learn model from training data. Parameters @@ -305,10 +317,6 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params): Target values. New in v 0.13.0: pandas DataFrames are now also accepted as argument for y. - custom_feature_names : None or tuple (default: tuple) - Custom feature names for `self.k_feature_names` and - `self.subsets_[i]['feature_names']`. - (new in v 0.13.0) groups : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. Passed to the fit method of the cross-validator. 
@@ -330,29 +338,107 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params): self.k_feature_names_ = None self.k_score_ = None + X_, self.feature_names = _preprocess(X) + self.n_features = X_.shape[1] + + self.feature_names_to_idx_mapper = None + if self.feature_names is not None: + self.feature_names_to_idx_mapper = { + name: idx for idx, name in enumerate(self.feature_names) + } + self.fixed_features_ = self.fixed_features - self.fixed_features_set_ = set() - - if hasattr(X, "loc"): - X_ = X.values - if self.fixed_features is not None: - self.fixed_features_ = tuple( - X.columns.get_loc(c) if isinstance(c, str) else c - for c in self.fixed_features + if self.fixed_features_ is None: + self.fixed_features_ = tuple() + + fixed_feature_types = {type(i) for i in self.fixed_features_} + if len(fixed_feature_types) > 1: + raise ValueError( + f"fixed_features values must have the same type. Found {fixed_feature_types}." + ) + + if len(self.fixed_features_) > 0 and isinstance(self.fixed_features_[0], str): + if self.feature_names_to_idx_mapper is None: + raise ValueError( + "The input X does not contain name of features provived in" + " `fixed_features`. Try passing input X as pandas DataFrames." ) - else: - X_ = X - if self.fixed_features is not None: - self.fixed_features_set_ = set(self.fixed_features_) + self.fixed_features_ = tuple( + self.feature_names_to_idx_mapper[name] for name in self.fixed_features_ + ) + + if not set(self.fixed_features_).issubset(set(range(X_.shape[1]))): + raise ValueError( + "`fixed_features` contains at least one feature that is not in the" + " input data `X`." + ) + + self.feature_groups_ = self.feature_groups + if self.feature_groups_ is None: + self.feature_groups_ = [[i] for i in range(X_.shape[1])] + + for fg in self.feature_groups_: + if len(fg) == 0: + raise ValueError( + "Each list in the nested lists `features_group` cannot be empty" + ) - if custom_feature_names is not None and len(custom_feature_names) != X.shape[1]: + feature_group_types = { + type(i) for sublist in self.feature_groups_ for i in sublist + } + if len(feature_group_types) > 1: raise ValueError( - "If custom_feature_names is not None, " - "the number of elements in custom_feature_names " - "must equal the number of columns in X." + f"feature_group values must have the same type. Found {feature_group_types}." ) + if isinstance(self.feature_groups_[0][0], str): + if self.feature_names_to_idx_mapper is None: + raise ValueError( + "The input X does not contain name of features provived in" + " `feature_groups`. 
Try passing input X as pandas DataFrames" + " in which the name of features match the ones provided in" + " `feature_groups`" + ) + + lst = [] + for item in self.feature_groups_: + tmp = [self.feature_names_to_idx_mapper[name] for name in item] + lst.append(tmp) + + self.feature_groups_ = lst + + if sorted(_merge_lists(self.feature_groups_)) != sorted( + list(range(X_.shape[1])) + ): + raise ValueError( + "`feature_group` must contain all features within `range(X.shape[1])`" + " and there should be no common feature betweeen any two distinct" + " group of features provided in `feature_group`" + ) + + features_encoded_by_groupID = np.full(X_.shape[1], -1, dtype=np.int64) + for group_id, group in enumerate(self.feature_groups_): + for idx in group: + features_encoded_by_groupID[idx] = group_id + + lst = [features_encoded_by_groupID[idx] for idx in self.fixed_features_] + self.fixed_features_group_set = set(lst) + + n_fixed_features_expected = sum( + len(self.feature_groups_[group_id]) + for group_id in self.fixed_features_group_set + ) + if n_fixed_features_expected != len(self.fixed_features_): + raise ValueError( + "For each feature specified in the `fixed feature`, its group-mates" + "must be specified as `fix_features` as well when `feature_groups`" + "is provided." + ) + + self.k_lb = max(1, len(self.fixed_features_group_set)) + self.k_ub = len(self.feature_groups_) + if ( not isinstance(self.k_features, int) and not isinstance(self.k_features, tuple) @@ -362,29 +448,18 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params): "k_features must be a positive integer" ", tuple, or string" ) + eligible_k_values_range = range(self.k_lb, self.k_ub + 1) if isinstance(self.k_features, int) and ( - self.k_features < 1 or self.k_features > X_.shape[1] + self.k_features not in eligible_k_values_range ): - raise AttributeError( - "k_features must be a positive integer" - " between 1 and X.shape[1], got %s" % (self.k_features,) - ) + err_msg = self.generate_error_message_k_features("k_features") + raise AttributeError(err_msg) if isinstance(self.k_features, tuple): if len(self.k_features) != 2: raise AttributeError( "k_features tuple must consist of 2" - " elements a min and a max value." - ) - - if self.k_features[0] not in range(1, X_.shape[1] + 1): - raise AttributeError( - "k_features tuple min value must be in" " range(1, X.shape[1]+1)." - ) - - if self.k_features[1] not in range(1, X_.shape[1] + 1): - raise AttributeError( - "k_features tuple max value must be in" " range(1, X.shape[1]+1)." + " elements, a min and a max value." ) if self.k_features[0] > self.k_features[1]: @@ -393,166 +468,94 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params): " than the max k_features value." ) - if isinstance(self.k_features, tuple) or isinstance(self.k_features, str): - - select_in_range = True + if self.k_features[0] not in eligible_k_values_range: + err_msg = self.generate_error_message_k_features( + "k_features tuple min value" + ) + raise AttributeError(err_msg) - if isinstance(self.k_features, str): - if self.k_features not in {"best", "parsimonious"}: - raise AttributeError( - "If a string argument is provided, " - 'it must be "best" or "parsimonious"' - ) - else: - min_k = 1 - max_k = X_.shape[1] - else: - min_k = self.k_features[0] - max_k = self.k_features[1] + # raise AttributeError( + # "k_features tuple min value must be in" " range(1, X.shape[1]+1)." 
+ # ) - else: - select_in_range = False - k_to_select = self.k_features + if self.k_features[1] not in eligible_k_values_range: + err_msg = self.generate_error_message_k_features( + "k_features tuple max value" + ) + raise AttributeError(err_msg) - orig_set = set(range(X_.shape[1])) - n_features = X_.shape[1] + # raise AttributeError( + # "k_features tuple max value must be in" " range(1, X.shape[1]+1)." + # ) - if self.forward and self.fixed_features is not None: - orig_set = set(range(X_.shape[1])) - self.fixed_features_set_ - n_features = len(orig_set) + self.is_parsimonious = False + if isinstance(self.k_features, str): + if self.k_features not in {"best", "parsimonious"}: + raise AttributeError( + "If a string argument is provided, " + 'it must be "best" or "parsimonious"' + ) + if self.k_features == "parsimonious": + self.is_parsimonious = True - if self.forward: - if select_in_range: - k_to_select = max_k + if isinstance(self.k_features, str): + self.k_features = (self.k_lb, self.k_ub) + elif isinstance(self.k_features, int): + # we treat k_features as k group of features + self.k_features = (self.k_features, self.k_features) - if self.fixed_features is not None: - k_idx = self.fixed_features_ - k = len(k_idx) - k_idx, k_score = _calc_score( - self, X_[:, k_idx], y, k_idx, groups=groups, **fit_params - ) - self.subsets_[k] = { - "feature_idx": k_idx, - "cv_scores": k_score, - "avg_score": np.nanmean(k_score), - } + self.min_k = self.k_features[0] + self.max_k = self.k_features[1] - else: - k_idx = () - k = 0 + if self.forward: + k_idx = tuple(sorted(self.fixed_features_group_set)) + k_stop = self.max_k else: - if select_in_range: - k_to_select = min_k - k_idx = tuple(orig_set) - k = len(k_idx) + k_idx = tuple(range(self.k_ub)) + k_stop = self.min_k + + k = len(k_idx) + if k > 0: k_idx, k_score = _calc_score( - self, X_[:, k_idx], y, k_idx, groups=groups, **fit_params + self, + X_, + y, + k_idx, + groups=groups, + feature_groups=self.feature_groups_, + **fit_params, ) self.subsets_[k] = { "feature_idx": k_idx, "cv_scores": k_score, "avg_score": np.nanmean(k_score), } - best_subset = None - k_score = 0 + orig_set = set(range(self.k_ub)) try: - while k != k_to_select: + while k != k_stop: prev_subset = set(k_idx) - if self.forward: - k_idx, k_score, cv_scores = self._inclusion( - orig_set=orig_set, - subset=prev_subset, - X=X_, - y=y, - groups=groups, - **fit_params - ) + search_set = orig_set + must_include_set = prev_subset else: - k_idx, k_score, cv_scores = self._exclusion( - feature_set=prev_subset, - X=X_, - y=y, - groups=groups, - fixed_feature=self.fixed_features_set_, - **fit_params - ) - - if self.floating: - - if self.forward: - continuation_cond_1 = len(k_idx) - else: - continuation_cond_1 = n_features - len(k_idx) - - continuation_cond_2 = True - ran_step_1 = True - new_feature = None - - while continuation_cond_1 >= 2 and continuation_cond_2: - k_score_c = None - - if ran_step_1: - (new_feature,) = set(k_idx) ^ prev_subset - - if self.forward: - - fixed_features_ok = True - if ( - self.fixed_features is not None - and len(self.fixed_features) - len(k_idx) <= 1 - ): - fixed_features_ok = False - if fixed_features_ok: - k_idx_c, k_score_c, cv_scores_c = self._exclusion( - feature_set=k_idx, - fixed_feature=( - {new_feature} | self.fixed_features_set_ - ), - X=X_, - y=y, - groups=groups, - **fit_params - ) - - else: - k_idx_c, k_score_c, cv_scores_c = self._inclusion( - orig_set=orig_set - {new_feature}, - subset=set(k_idx), - X=X_, - y=y, - groups=groups, - **fit_params - ) 
- - if k_score_c is not None and k_score_c > k_score: - - if len(k_idx_c) in self.subsets_: - cached_score = self.subsets_[len(k_idx_c)]["avg_score"] - else: - cached_score = None - - if cached_score is None or k_score_c > cached_score: - prev_subset = set(k_idx) - k_idx, k_score, cv_scores = ( - k_idx_c, - k_score_c, - cv_scores_c, - ) - continuation_cond_1 = len(k_idx) - ran_step_1 = False - - else: - continuation_cond_2 = False - - else: - continuation_cond_2 = False + search_set = prev_subset + must_include_set = self.fixed_features_group_set + + k_idx, k_score, cv_scores = self._feature_selector( + search_set, + must_include_set, + X=X_, + y=y, + is_forward=self.forward, + groups=groups, + feature_groups=self.feature_groups_, + **fit_params, + ) k = len(k_idx) # floating can lead to multiple same-sized subsets if k not in self.subsets_ or (k_score > self.subsets_[k]["avg_score"]): - k_idx = tuple(sorted(k_idx)) self.subsets_[k] = { "feature_idx": k_idx, @@ -560,8 +563,63 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params): "avg_score": k_score, } + if self.floating: + # floating direction is opposite of self.forward, i.e. in + # forward selection, we do floating in backward manner, + # and in backward selection, we do floating in forward manner + is_float_forward = not self.forward + (new_feature_idx,) = set(k_idx) ^ prev_subset + for _ in range(X_.shape[1]): + if ( + self.forward + and (len(k_idx) - len(self.fixed_features_group_set)) <= 2 + ): + break + if not self.forward and (len(orig_set) - len(k_idx) <= 2): + break + + if is_float_forward: + # corresponding to self.forward=False + search_set = orig_set - {new_feature_idx} + must_include_set = set(k_idx) + else: + # corresponding to self.forward=True + search_set = set(k_idx) + must_include_set = self.fixed_features_group_set | { + new_feature_idx + } + + (k_idx_c, k_score_c, cv_scores_c,) = self._feature_selector( + search_set, + must_include_set, + X=X_, + y=y, + is_forward=is_float_forward, + groups=groups, + feature_groups=self.feature_groups_, + **fit_params, + ) + + if k_score_c <= k_score: + break + + # In the floating process, we basically revisit our previous + # steps. 
so, len(k_idx_c) definitely exists as a key in + # the dictionary `self.subsets_` + if k_score_c <= self.subsets_[len(k_idx_c)]["avg_score"]: + break + else: + k_idx, k_score, cv_scores = k_idx_c, k_score_c, cv_scores_c + k_idx = tuple(sorted(k_idx)) + k = len(k_idx) + self.subsets_[k] = { + "feature_idx": k_idx, + "cv_scores": cv_scores, + "avg_score": k_score, + } + if self.verbose == 1: - sys.stderr.write("\rFeatures: %d/%s" % (len(k_idx), k_to_select)) + sys.stderr.write("\rFeatures: %d/%s" % (len(k_idx), k_stop)) sys.stderr.flush() elif self.verbose > 1: sys.stderr.write( @@ -569,121 +627,179 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params): % ( datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), len(k_idx), - k_to_select, + k_stop, k_score, ) ) - if self._TESTING_INTERRUPT_MODE: - self.subsets_, self.k_feature_names_ = _get_featurenames( - self.subsets_, self.k_feature_idx_, custom_feature_names, X - ) + if self._TESTING_INTERRUPT_MODE: # just to test `KeyboardInterrupt` + self.finalize_fit() raise KeyboardInterrupt except KeyboardInterrupt: self.interrupted_ = True sys.stderr.write("\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...") - if select_in_range: - max_score = float("-inf") + if self.interrupted_: + self.fitted = False + else: + self.fitted = True # the completion of sequential selection process. + self.finalize_fit() + + return self + + def finalize_fit(self): + max_score = np.NINF + for k in self.subsets_: + if ( + k >= self.min_k + and k <= self.max_k + and self.subsets_[k]["avg_score"] > max_score + ): + max_score = self.subsets_[k]["avg_score"] + best_subset = k + + k_score = max_score + if k_score == np.NINF: + # i.e. all keys of self.subsets_ are not in interval `[self.min_k, self.max_k]` + # this happens if KeyboardInterrupt happens + keys = list(self.subsets_.keys()) + scores = [self.subsets_[k]["avg_score"] for k in keys] + arg = np.argmax(scores) - max_score = float("-inf") + k_score = scores[arg] + best_subset = keys[arg] + + k_idx = self.subsets_[best_subset]["feature_idx"] + + if self.is_parsimonious: for k in self.subsets_: - if k < min_k or k > max_k: + if k >= best_subset: continue - if self.subsets_[k]["avg_score"] > max_score: + if self.subsets_[k]["avg_score"] >= ( + max_score + - np.std(self.subsets_[k]["cv_scores"]) + / self.subsets_[k]["cv_scores"].shape[0] + ): max_score = self.subsets_[k]["avg_score"] best_subset = k + k_score = max_score k_idx = self.subsets_[best_subset]["feature_idx"] - if self.k_features == "parsimonious": - for k in self.subsets_: - if k >= best_subset: - continue - if self.subsets_[k]["avg_score"] >= ( - max_score - - np.std(self.subsets_[k]["cv_scores"]) - / self.subsets_[k]["cv_scores"].shape[0] - ): - max_score = self.subsets_[k]["avg_score"] - best_subset = k - k_score = max_score - k_idx = self.subsets_[best_subset]["feature_idx"] - - self.k_feature_idx_ = k_idx + for k in self.subsets_: + self.subsets_[k]["feature_idx"] = _merge_lists( + self.feature_groups_, self.subsets_[k]["feature_idx"] + ) + self.k_feature_idx_ = _merge_lists(self.feature_groups_, k_idx) self.k_score_ = k_score - self.fitted = True self.subsets_, self.k_feature_names_ = _get_featurenames( - self.subsets_, self.k_feature_idx_, custom_feature_names, X + self.subsets_, self.k_feature_idx_, self.feature_names, self.n_features ) - return self - def _inclusion( - self, orig_set, subset, X, y, ignore_feature=None, groups=None, **fit_params + return + + def _feature_selector( + self, + search_set, + must_include_set, + X, + y, + 
is_forward, + groups=None, + feature_groups=None, + **fit_params, ): - all_avg_scores = [] - all_cv_scores = [] - all_subsets = [] - res = (None, None, None) - remaining = orig_set - subset - if remaining: - features = len(remaining) - n_jobs = min(self.n_jobs, features) + """Perform one round of feature selection. When `is_forward=True`, it is + a forward selection that searches the `search_set` to find one feature that + with `must_include_set` results in highest average score. When + `is_forward=False`, it is a backward selection that searches the `search_set` + for a feature that its exclusion results in a set of features that includes + `must_include_set` and has the highest averege score. + + Parameters + ---------- + self : object + an instance of class `SequentialFeatureSelector` + + search_set : set + a set of features through which a feature must be selected to be included + (when `is_forward=True`) or to be excluded (when `is_forward=False`) + + must_include_set : set + a set of features that must be present in the selected subset of features + + X : numpy.ndarray + a 2D numpy array. Each row corresponds to one observation and each + column corresponds to one feature. + + y : numpy.ndarray + the target variable + + is_forward : bool + True if it is forward selection. False if it is backward selection + + groups : array-like, with shape (n_samples,), optional + Group labels for the samples used while splitting the dataset into + train/test set. Passed to the fit method of the cross-validator. + + feature_groups : list or None (default: None) + Optional argument for treating certain features as a group. + + fit_params : various, optional + Additional parameters that are being passed to the estimator. + For example, `sample_weights=weights`. + + Returns + ------- + out1 : the selected set of features that has the highest mean of cv scores + out2 : the mean of cv scores for the selected set of features. 
+ out3 : all cv scores for the selected set of features + """ + out = (None, None, None) + + if feature_groups is None: + feature_groups = [[i] for i in range(X.shape[1])] + + remaining_set = search_set - must_include_set + remaining = list(remaining_set) + n = len(remaining) + if n > 0: + if is_forward: + feature_explorer = combinations(remaining, r=1) + else: + feature_explorer = combinations(remaining, r=n - 1) + + n_jobs = min(self.n_jobs, n) parallel = Parallel( n_jobs=n_jobs, verbose=self.verbose, pre_dispatch=self.pre_dispatch ) work = parallel( delayed(_calc_score)( self, - X[:, tuple(subset | {feature})], + X, y, - tuple(subset | {feature}), + tuple(set(p) | must_include_set), groups=groups, - **fit_params + feature_groups=feature_groups, + **fit_params, ) - for feature in remaining - if feature != ignore_feature + for p in feature_explorer ) - for new_subset, cv_scores in work: - all_avg_scores.append(np.nanmean(cv_scores)) - all_cv_scores.append(cv_scores) - all_subsets.append(new_subset) - - best = np.argmax(all_avg_scores) - res = (all_subsets[best], all_avg_scores[best], all_cv_scores[best]) - return res - - def _exclusion( - self, feature_set, X, y, fixed_feature=None, groups=None, **fit_params - ): - n = len(feature_set) - res = (None, None, None) - if n > 1: all_avg_scores = [] all_cv_scores = [] all_subsets = [] - features = n - n_jobs = min(self.n_jobs, features) - parallel = Parallel( - n_jobs=n_jobs, verbose=self.verbose, pre_dispatch=self.pre_dispatch - ) - work = parallel( - delayed(_calc_score)(self, X[:, p], y, p, groups=groups, **fit_params) - for p in combinations(feature_set, r=n - 1) - if not fixed_feature or fixed_feature.issubset(set(p)) - ) - - for p, cv_scores in work: - + for new_subset, cv_scores in work: all_avg_scores.append(np.nanmean(cv_scores)) all_cv_scores.append(cv_scores) - all_subsets.append(p) + all_subsets.append(new_subset) - best = np.argmax(all_avg_scores) - res = (all_subsets[best], all_avg_scores[best], all_cv_scores[best]) - return res + if len(all_avg_scores) > 0: + best = np.argmax(all_avg_scores) + out = (all_subsets[best], all_avg_scores[best], all_cv_scores[best]) + + return out def transform(self, X): """Reduce X to its most important features. 
@@ -702,10 +818,7 @@ def transform(self, X): """ self._check_fitted() - if hasattr(X, "loc"): - X_ = X.values - else: - X_ = X + X_, _ = _preprocess(X) return X_[:, self.k_feature_idx_] def fit_transform(self, X, y, groups=None, **fit_params): diff --git a/mlxtend/feature_selection/tests/test_exhaustive_feature_selector.py b/mlxtend/feature_selection/tests/test_exhaustive_feature_selector.py index b0a7ed40e..ccfcd7413 100644 --- a/mlxtend/feature_selection/tests/test_exhaustive_feature_selector.py +++ b/mlxtend/feature_selection/tests/test_exhaustive_feature_selector.py @@ -20,7 +20,7 @@ from mlxtend.utils import assert_raises -def dict_compare_utility(d1, d2): +def dict_compare_utility(d1, d2, decimal=2): assert d1.keys() == d2.keys(), "%s != %s" % (d1, d2) for i in d1: err_msg1 = "d1[%s]['feature_idx']" " != d2[%s]['feature_idx']" % (i, i) @@ -30,13 +30,13 @@ def dict_compare_utility(d1, d2): assert_almost_equal( d1[i]["avg_score"], d2[i]["avg_score"], - decimal=2, + decimal=decimal, err_msg=("d1[%s]['avg_score']" " != d2[%s]['avg_score']" % (i, i)), ) assert_almost_equal( d1[i]["cv_scores"], d2[i]["cv_scores"], - decimal=2, + decimal=decimal, err_msg=("d1[%s]['cv_scores']" " != d2[%s]['cv_scores']" % (i, i)), ) @@ -92,14 +92,14 @@ def test_knn_wo_cv(): 0: { "feature_idx": (0, 1), "feature_names": ("0", "1"), - "avg_score": 0.82666666666666666, - "cv_scores": np.array([0.82666667]), + "avg_score": 0.8333333333333334, + "cv_scores": np.array([0.8333333333333334]), }, 1: { "feature_idx": (0, 2), "feature_names": ("0", "2"), "avg_score": 0.95999999999999996, - "cv_scores": np.array([0.96]), + "cv_scores": np.array([0.95999999999999996]), }, 2: { "feature_idx": (0, 3), @@ -660,8 +660,8 @@ def test_knn_wo_cv_with_fixed_features_and_feature_groups_case1(): 0: { "feature_idx": (0, 1), "feature_names": ("0", "1"), - "avg_score": 0.82666666666666666, - "cv_scores": np.array([0.82666667]), + "avg_score": 0.8333333333333334, + "cv_scores": np.array([0.8333333333333334]), }, 1: { "feature_idx": (0, 1, 2), @@ -781,8 +781,8 @@ def test_check_support_string_in_fixed_feature(): 0: { "feature_idx": (0, 1), "feature_names": (features_names[0], features_names[1]), - "avg_score": 0.82666666666666666, - "cv_scores": np.array([0.82666667]), + "avg_score": 0.8333333333333334, + "cv_scores": np.array([0.8333333333333334]), }, 1: { "feature_idx": (0, 1, 2), diff --git a/mlxtend/feature_selection/tests/test_sequential_feature_selector.py b/mlxtend/feature_selection/tests/test_sequential_feature_selector.py index 6c6fbe586..88cf7a696 100644 --- a/mlxtend/feature_selection/tests/test_sequential_feature_selector.py +++ b/mlxtend/feature_selection/tests/test_sequential_feature_selector.py @@ -10,6 +10,7 @@ from packaging.version import Version from sklearn import __version__ as sklearn_version from sklearn.datasets import load_boston, load_iris +from sklearn.decomposition import PCA from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LinearRegression from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score @@ -31,7 +32,7 @@ def nan_roc_auc_score(y_true, y_score, average="macro", sample_weight=None): ) -def dict_compare_utility(d_actual, d_desired, decimal=3): +def dict_compare_utility(d_actual, d_desired, decimal=2): assert d_actual.keys() == d_desired.keys(), "%s != %s" % (d_actual, d_desired) for i in d_actual: err_msg = "d_actual[%s]['feature_idx']" " != d_desired[%s]['feature_idx']" % ( @@ -83,7 +84,8 @@ def test_kfeatures_type_1(): X = iris.data y = iris.target knn 
= KNeighborsClassifier() - expect = "k_features must be a positive integer between 1 and X.shape[1]," " got 0" + name = "k_features" + expect = f"{name} must be between 1 and X.shape[1]." sfs = SFS(estimator=knn, verbose=0, k_features=0) assert_raises(AttributeError, expect, sfs.fit, X, y) @@ -103,7 +105,7 @@ def test_kfeatures_type_3(): X = iris.data y = iris.target knn = KNeighborsClassifier() - expect = "k_features tuple min value must be in range(1, X.shape[1]+1)." + expect = "k_features tuple min value must be between 1 and X.shape[1]." sfs = SFS(estimator=knn, verbose=0, k_features=(0, 5)) assert_raises(AttributeError, expect, sfs.fit, X, y) @@ -113,7 +115,7 @@ def test_kfeatures_type_4(): X = iris.data y = iris.target knn = KNeighborsClassifier() - expect = "k_features tuple max value must be in range(1, X.shape[1]+1)." + expect = "k_features tuple max value must be between 1 and X.shape[1]." sfs = SFS(estimator=knn, verbose=0, k_features=(1, 5)) assert_raises(AttributeError, expect, sfs.fit, X, y) @@ -124,7 +126,7 @@ def test_kfeatures_type_5(): y = iris.target knn = KNeighborsClassifier() expect = ( - "The min k_features value must be" " smaller than the max k_features value." + "The min k_features value must be smaller" " than the max k_features value." ) sfs = SFS(estimator=knn, verbose=0, k_features=(3, 1)) assert_raises(AttributeError, expect, sfs.fit, X, y) @@ -880,33 +882,156 @@ def test_check_pandas_dataframe_transform(): assert (150, 2) == sfs1.transform(df).shape -def test_custom_feature_names(): +def test_invalid_estimator(): + expect = "Estimator must have an ._estimator_type for infering `scoring`" + assert_raises(AttributeError, expect, SFS, PCA()) + class PCA2(PCA): + def __init__(self): + super().__init__() + self._estimator_type = "something" + + expect = "Estimator must be a Classifier or Regressor." + assert_raises(AttributeError, expect, SFS, PCA2()) + + +def test_invalid_k_features(): + + iris = load_iris() + X = iris.data + y = iris.target + lr = SoftmaxRegression(random_seed=1) + + sfs1 = SFS(lr, k_features=(1, 2, 3), scoring="accuracy") + expect = "k_features tuple must consist of 2 elements, a min and a max value." 
+ assert_raises(AttributeError, expect, sfs1.fit, X, y) + + sfs1 = SFS(lr, k_features="something", scoring="accuracy") + expect = 'If a string argument is provided, it must be "best" or "parsimonious"' + assert_raises(AttributeError, expect, sfs1.fit, X, y) + + +def test_verbose(): + + iris = load_iris() + X = iris.data + y = iris.target + lr = SoftmaxRegression(random_seed=1) + + sfs1 = SFS(lr, k_features=1, scoring="accuracy", verbose=1) + sfs1.fit(X, y) + + +def test_check_pandas_dataframe_with_feature_groups(): iris = load_iris() X = iris.data y = iris.target lr = SoftmaxRegression(random_seed=1) + + df = pd.DataFrame( + X, columns=["sepal length", "sepal width", "petal length", "petal width"] + ) + sfs1 = SFS( lr, k_features=2, forward=True, floating=False, scoring="accuracy", + feature_groups=[ + ["sepal length", "petal length"], + ["sepal width"], + ["petal width"], + ], cv=0, verbose=0, n_jobs=1, ) - sfs1 = sfs1.fit( - X, - y, - custom_feature_names=( - "sepal length", - "sepal width", - "petal length", - "petal width", - ), + sfs1 = sfs1.fit(df, y) + assert sfs1.k_feature_names_ == ( + "sepal width", + "petal width", + ), sfs1.k_feature_names_ + assert (150, 2) == sfs1.transform(df).shape + + # now, test with different `feature_groups` + sfs1 = SFS( + lr, + k_features=2, # this is num of selected groups to form selected features + forward=True, + floating=False, + scoring="accuracy", + feature_groups=[ + ["petal length", "petal width"], + ["sepal length"], + ["sepal width"], + ], + cv=0, + verbose=0, + n_jobs=1, ) - assert sfs1.k_feature_idx_ == (1, 3) - assert sfs1.k_feature_names_ == ("sepal width", "petal width") - assert sfs1.subsets_[2]["feature_names"] == ("sepal width", "petal width") + + sfs1 = sfs1.fit(df, y) + # the selected fetures are sorted according their corresponding indices + assert sfs1.k_feature_names_ == ( + "sepal width", + "petal length", + "petal width", + ), sfs1.k_feature_names_ + + +def test_check_pandas_dataframe_with_feature_groups_and_fixed_features(): + iris = load_iris() + X = iris.data + y = iris.target + lr = SoftmaxRegression(random_seed=123) + + df = pd.DataFrame( + X, columns=["sepal length", "sepal width", "petal length", "petal width"] + ) + sfs1 = SFS( + lr, + k_features=2, + forward=True, + floating=False, + scoring="accuracy", + feature_groups=[ + ["petal length", "petal width"], + ["sepal length"], + ["sepal width"], + ], + fixed_features=["sepal length", "petal length", "petal width"], + cv=0, + verbose=0, + n_jobs=1, + ) + + sfs1 = sfs1.fit(df, y) + assert sfs1.k_feature_names_ == ( + "sepal length", + "petal length", + "petal width", + ), sfs1.k_feature_names_ + + +def test_check_feature_groups(): + iris = load_iris() + X = iris.data + y = iris.target + lr = SoftmaxRegression(random_seed=123) + sfs1 = SFS( + lr, + k_features=2, + forward=True, + floating=False, + scoring="accuracy", + feature_groups=[[2, 3], [0], [1]], + fixed_features=[0, 2, 3], + cv=0, + verbose=0, + n_jobs=1, + ) + + sfs1 = sfs1.fit(X, y) + assert sfs1.k_feature_idx_ == (0, 2, 3), sfs1.k_feature_idx_ diff --git a/mlxtend/feature_selection/tests/test_sequential_feature_selector_feature_groups.py b/mlxtend/feature_selection/tests/test_sequential_feature_selector_feature_groups.py new file mode 100644 index 000000000..7c802b068 --- /dev/null +++ b/mlxtend/feature_selection/tests/test_sequential_feature_selector_feature_groups.py @@ -0,0 +1,275 @@ +# Sebastian Raschka 2014-2022 +# mlxtend Machine Learning Library Extensions +# Author: Sebastian Raschka +# +# License: BSD 
3 clause +import numpy as np +from numpy import nan +from numpy.testing import assert_almost_equal +from sklearn.datasets import load_boston, load_iris +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LinearRegression +from sklearn.metrics import roc_auc_score +from sklearn.neighbors import KNeighborsClassifier + +from mlxtend.feature_selection import SequentialFeatureSelector as SFS +from mlxtend.utils import assert_raises + + +def dict_compare_utility(d_actual, d_desired, decimal=2): + assert d_actual.keys() == d_desired.keys(), "%s != %s" % (d_actual, d_desired) + for i in d_actual: + err_msg = "d_actual[%s]['feature_idx']" " != d_desired[%s]['feature_idx']" % ( + i, + i, + ) + assert d_actual[i]["feature_idx"] == d_desired[i]["feature_idx"], err_msg + assert_almost_equal( + actual=d_actual[i]["avg_score"], + desired=d_desired[i]["avg_score"], + decimal=decimal, + err_msg=( + "d_actual[%s]['avg_score']" " != d_desired[%s]['avg_score']" % (i, i) + ), + ) + assert_almost_equal( + actual=d_actual[i]["cv_scores"], + desired=d_desired[i]["cv_scores"], + decimal=decimal, + err_msg=( + "d_actual[%s]['cv_scores']" " != d_desired[%s]['cv_scores']" % (i, i) + ), + ) + + +def test_run_default(): + iris = load_iris() + X = iris.data + y = iris.target + knn = KNeighborsClassifier() + sfs = SFS(estimator=knn, verbose=0) + sfs.fit(X, y) + assert sfs.k_feature_idx_ == (3,) + + +def test_fit_params(): + iris = load_iris() + X = iris.data + y = iris.target + sample_weight = np.ones(X.shape[0]) + forest = RandomForestClassifier(n_estimators=100, random_state=123) + sfs = SFS(estimator=forest, verbose=0) + sfs.fit(X, y, sample_weight=sample_weight) + assert sfs.k_feature_idx_ == (3,) + + +def test_knn_wo_cv_feature_groups_default(): + iris = load_iris() + X = iris.data + y = iris.target + knn = KNeighborsClassifier(n_neighbors=4) + sfs1 = SFS( + knn, + k_features=3, + forward=True, + floating=False, + cv=0, + verbose=0, + feature_groups=[[0], [1], [2], [3]], + ) + sfs1 = sfs1.fit(X, y) + expect = { + 1: { + "avg_score": 0.95999999999999996, + "cv_scores": np.array([0.96]), + "feature_idx": (3,), + }, + 2: { + "avg_score": 0.97333333333333338, + "cv_scores": np.array([0.97333333]), + "feature_idx": (2, 3), + }, + 3: { + "avg_score": 0.97333333333333338, + "cv_scores": np.array([0.97333333]), + "feature_idx": (1, 2, 3), + }, + } + dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect, decimal=2) + + +def test_regression_sbfs(): + boston = load_boston() + X, y = boston.data, boston.target + lr = LinearRegression() + sfs_r = SFS( + lr, + k_features=1, # this is number of groups if `feature_groups` is not None + forward=False, + floating=True, + scoring="neg_mean_squared_error", + cv=10, + verbose=0, + feature_groups=[[7, 10, 12], [0], [1], [2], [3], [4], [5], [6], [8], [9], [11]], + ) + sfs_r = sfs_r.fit(X, y) + assert sfs_r.k_feature_idx_ == (7, 10, 12), sfs_r.k_feature_idx_ + + +def test_transform_not_fitted(): + iris = load_iris() + X = iris.data + knn = KNeighborsClassifier(n_neighbors=4) + + sfs1 = SFS( + knn, + k_features=2, + forward=True, + floating=False, + cv=0, + clone_estimator=False, + verbose=0, + n_jobs=1, + ) + + expect = "SequentialFeatureSelector has not been fitted, yet." 
+ + assert_raises(AttributeError, expect, sfs1.transform, X) + + +def test_keyboard_interrupt(): + iris = load_iris() + X = iris.data + y = iris.target + + knn = KNeighborsClassifier(n_neighbors=4) + sfs1 = SFS( + knn, + k_features=2, + forward=True, + floating=False, + cv=3, + clone_estimator=False, + verbose=5, + n_jobs=1, + feature_groups=[[0, 1], [2], [3]], + ) + + sfs1._TESTING_INTERRUPT_MODE = True + out = sfs1.fit(X, y) + + assert len(out.subsets_.keys()) > 0 + assert sfs1.interrupted_ + + +def test_max_feature_subset_best(): + boston = load_boston() + X, y = boston.data, boston.target + lr = LinearRegression() + + sfs = SFS( + lr, + k_features="best", + forward=True, + floating=False, + cv=10, + feature_groups=[ + [0], + [2, 4], + [1, 3, 5], + [6], + [7, 8, 9, 10], + [11], + [12], + ], + ) + + sfs = sfs.fit(X, y) + assert sfs.k_feature_idx_ == (1, 3, 5, 7, 8, 9, 10, 11, 12) + + +def test_max_feature_subset_parsimonious(): + boston = load_boston() + X, y = boston.data, boston.target + lr = LinearRegression() + + sfs = SFS( + lr, + k_features="parsimonious", + forward=True, + floating=False, + cv=10, + feature_groups=[ + [0], + [1, 3], + [2, 4], + [5, 10, 11, 12], + [6], + [7], + [8, 9], + ], + ) + + sfs = sfs.fit(X, y) + assert sfs.k_feature_idx_ == (5, 10, 11, 12) + + +def test_knn_wo_cv_with_fixed_features_and_feature_groups_case1(): + # features (0, 1) gives different score? + iris = load_iris() + X = iris.data + y = iris.target + knn = KNeighborsClassifier(n_neighbors=4) + sfs = SFS( + knn, + k_features=(1, 2), + scoring="accuracy", + cv=0, + fixed_features=[0, 1], + feature_groups=[[0, 1], [2], [3]], + ) + sfs.fit(X, y) + # expect is based on what provided in `test_knn_wo_cv` + expect = { + 1: { + "feature_idx": (0, 1), + "feature_names": ("0", "1"), + "avg_score": 0.8333333333333334, + "cv_scores": np.array([0.8333333333333334]), + }, + 2: { + "feature_idx": (0, 1, 3), + "feature_names": ("0", "1", "3"), + "avg_score": 0.96666666666666667, + "cv_scores": np.array([0.96666667]), + }, + } + dict_compare_utility(d_actual=expect, d_desired=sfs.subsets_) + + +def test_knn_wo_cv_with_fixed_features_and_feature_groups_case2(): + # similar to case1, but `fixed_features` is now consisting of two groups + # [0,1] and [3] + iris = load_iris() + X = iris.data + y = iris.target + knn = KNeighborsClassifier(n_neighbors=4) + efs1 = SFS( + knn, + k_features=2, + scoring="accuracy", + cv=0, + fixed_features=[0, 1, 3], + feature_groups=[[0, 1], [2], [3]], + ) + efs1 = efs1.fit(X, y) + # expect is based on what provided in `test_knn_wo_cv` + expect = { + 2: { + "feature_idx": (0, 1, 3), + "feature_names": ("0", "1", "3"), + "avg_score": 0.96666666666666667, + "cv_scores": np.array([0.96666667]), + }, + } + dict_compare_utility(d_actual=expect, d_desired=efs1.subsets_) diff --git a/mlxtend/feature_selection/tests/test_sequential_feature_selector_fixed_features.py b/mlxtend/feature_selection/tests/test_sequential_feature_selector_fixed_features.py index cc98d6346..0858124b1 100644 --- a/mlxtend/feature_selection/tests/test_sequential_feature_selector_fixed_features.py +++ b/mlxtend/feature_selection/tests/test_sequential_feature_selector_fixed_features.py @@ -168,24 +168,15 @@ def test_pandas(): def test_wrong_feature_number(): knn = KNeighborsClassifier() - expect = ( - "Number of features to be selected must be" - " larger than the number of features" - " specified via `fixed_features`. 
Got" - " `k_features=1` and `fixed_features=2`" - ) - assert_raises(ValueError, expect, SFS, knn, k_features=1, fixed_features=(1, 3)) + expect = "k_features must be between len(fixed_features) and X.shape[1]." + sfs = SFS(knn, k_features=1, fixed_features=(1, 3)) + assert_raises(AttributeError, expect, sfs.fit, X=X_iris, y=y_iris) def test_wrong_feature_range(): knn = KNeighborsClassifier() expect = ( - "The minimum number of features to be " - "selected must be larger than the number of " - "features specified via " - "`fixed_features`. " - "Got `k_features=(1, 3)` and `len(fixed_features)=2`" - ) - assert_raises( - ValueError, expect, SFS, knn, k_features=(1, 3), fixed_features=(1, 3) + "k_features tuple min value must be between len(fixed_features) and X.shape[1]." ) + sfs = SFS(knn, k_features=(1, 3), fixed_features=(1, 3)) + assert_raises(AttributeError, expect, sfs.fit, X=X_iris, y=y_iris) diff --git a/mlxtend/feature_selection/utilities.py b/mlxtend/feature_selection/utilities.py new file mode 100644 index 000000000..4290401e3 --- /dev/null +++ b/mlxtend/feature_selection/utilities.py @@ -0,0 +1,162 @@ +from copy import deepcopy + +import numpy as np +from sklearn.model_selection import cross_val_score + + +def _merge_lists(nested_list, high_level_indices=None): + """ + merge elements of lists (of a nested_list) into one single tuple with elements + sorted in ascending order. + + Parameters + ---------- + nested_list: List + a list whose elements must be list as well. + + high_level_indices: list or tuple, default None + a list or tuple that contains integers that are between 0 (inclusive) and + the length of `nested_lst` (exclusive). If None, the merge of all + lists nested in `nested_list` will be returned. + + Returns + ------- + out: tuple + a tuple, with elements sorted in ascending order, that is the merge of inner + lists whose indices are provided in `high_level_indices` + + Example: + nested_list = [[1],[2, 3],[4]] + high_level_indices = [1, 2] + >>> _merge_lists(nested_list, high_level_indices) + (2, 3, 4) # merging [2, 3] and [4] + """ + if high_level_indices is None: + high_level_indices = list(range(len(nested_list))) + + out = [] + for idx in high_level_indices: + out.extend(nested_list[idx]) + + return tuple(sorted(out)) + + +def _calc_score( + selector, X, y, indices, groups=None, feature_groups=None, **fit_params +): + """ + calculate the cross-validation score for feature data `X` and target variable + `y`. + + Parameters + --------- + selector : objcet with attributes est_` (estimator), `cv` (number of folds + in cross-validation), and `pre_dispatch`() + + X : numpy.ndarray + A 2D array consisting of feature data, where each column corresponds to + one feature, and each row corresponds to one instance (or observation) + + y : numpy.ndarray + A 1D array consiting of tartget values + + indices : list or tuple + A list or tuple of interger numbers. When `feature_groups` is not provided, + i.e. None (default), the values in indices represent the column indices of + X that should be consdered throughout the calculation of cross validation + score. When `feature_groups` is not None, the indices represent the indices + of the groups of features that should be considered through the calculation + of cross-validation score. + + groups : array-like, with shape (n_samples,), optional + Group labels for the samples used while splitting the dataset into + train/test set. Passed to the fit method of the cross-validator. 
diff --git a/mlxtend/feature_selection/utilities.py b/mlxtend/feature_selection/utilities.py
new file mode 100644
index 000000000..4290401e3
--- /dev/null
+++ b/mlxtend/feature_selection/utilities.py
@@ -0,0 +1,162 @@
+from copy import deepcopy
+
+import numpy as np
+from sklearn.model_selection import cross_val_score
+
+
+def _merge_lists(nested_list, high_level_indices=None):
+    """
+    Merge elements of lists (of a nested_list) into one single tuple with
+    elements sorted in ascending order.
+
+    Parameters
+    ----------
+    nested_list : list
+        A list whose elements must be lists as well.
+
+    high_level_indices : list or tuple, default None
+        A list or tuple of integers between 0 (inclusive) and the length of
+        `nested_list` (exclusive). If None, the merge of all lists nested in
+        `nested_list` will be returned.
+
+    Returns
+    -------
+    out : tuple
+        A tuple, with elements sorted in ascending order, that is the merge of
+        the inner lists whose indices are provided in `high_level_indices`.
+
+    Example:
+        nested_list = [[1], [2, 3], [4]]
+        high_level_indices = [1, 2]
+        >>> _merge_lists(nested_list, high_level_indices)
+        (2, 3, 4)  # merging [2, 3] and [4]
+    """
+    if high_level_indices is None:
+        high_level_indices = list(range(len(nested_list)))
+
+    out = []
+    for idx in high_level_indices:
+        out.extend(nested_list[idx])
+
+    return tuple(sorted(out))
+
+
+def _calc_score(
+    selector, X, y, indices, groups=None, feature_groups=None, **fit_params
+):
+    """
+    Calculate the cross-validation score for feature data `X` and target
+    variable `y`.
+
+    Parameters
+    ----------
+    selector : object with attributes `est_` (estimator), `cv` (number of folds
+        in cross-validation), and `pre_dispatch`
+
+    X : numpy.ndarray
+        A 2D array consisting of feature data, where each column corresponds to
+        one feature, and each row corresponds to one instance (or observation)
+
+    y : numpy.ndarray
+        A 1D array consisting of target values
+
+    indices : list or tuple
+        A list or tuple of integer numbers. When `feature_groups` is not provided,
+        i.e. None (default), the values in `indices` represent the column indices
+        of X that should be considered throughout the calculation of the
+        cross-validation score. When `feature_groups` is not None, the indices
+        represent the indices of the groups of features that should be considered
+        throughout the calculation of the cross-validation score.
+
+    groups : array-like, with shape (n_samples,), optional
+        Group labels for the samples used while splitting the dataset into
+        train/test set. Passed to the fit method of the cross-validator.
+
+    feature_groups : list or None (default: None)
+        Optional argument for treating certain features as a group.
+        This means, the features within a group are always selected together,
+        never split.
+        For example, `feature_groups=[[1], [2], [3, 4, 5]]`
+        specifies 3 feature groups.
+
+    fit_params : dict of string -> object, optional
+        Parameters to pass to the fit method of the classifier.
+
+    Returns
+    -------
+    indices : list or tuple
+        This is exactly the same as the input `indices`
+
+    scores : array
+        This is an array of cv scores, with length equal to the cv value.
+    """
+    if feature_groups is None:
+        feature_groups = [[i] for i in range(X.shape[1])]
+
+    IDX = _merge_lists(feature_groups, indices)
+    if selector.cv:
+        scores = cross_val_score(
+            selector.est_,
+            X[:, IDX],
+            y,
+            groups=groups,
+            cv=selector.cv,
+            scoring=selector.scorer,
+            n_jobs=1,
+            pre_dispatch=selector.pre_dispatch,
+            fit_params=fit_params,
+        )
+    else:
+        selector.est_.fit(X[:, IDX], y, **fit_params)
+        scores = np.array([selector.scorer(selector.est_, X[:, IDX], y)])
+    return indices, scores
+
+
+def _preprocess(X):
+    """
+    Check whether X is a DataFrame, and return a numpy ndarray along with the
+    feature names.
+
+    Parameters
+    ----------
+    X : DataFrame or numpy.ndarray
+        A DataFrame or a 2D numpy.ndarray
+
+    Returns
+    -------
+    X_ : numpy.ndarray
+        A 2D array that is equivalent to X.to_numpy()
+
+    features_names : list
+        A list consisting of the feature names. When `X` is a DataFrame, it
+        contains the column names. If `X` is a 2D array, it is None.
+    """
+    if X.ndim != 2:
+        raise ValueError(f"The input X must be a 2D array. Got {X.ndim}")
+
+    if type(X).__name__ == "DataFrame":
+        features_names = list(X.columns)
+        X_ = X.to_numpy(copy=True)
+    else:
+        # it is a numpy array
+        features_names = None
+        X_ = X.copy()
+
+    return X_, features_names
+
+
+def _get_featurenames(subsets_dict, feature_idx, feature_names, n_features):
+    if feature_names is None:
+        feature_names = [str(i) for i in range(n_features)]
+
+    dict_keys = subsets_dict.keys()
+    for key in dict_keys:
+        subsets_dict[key]["feature_names"] = tuple(
+            feature_names[idx] for idx in subsets_dict[key]["feature_idx"]
+        )
+
+    if feature_idx is None:
+        feature_idx_names = None
+    else:
+        feature_idx_names = tuple(feature_names[idx] for idx in feature_idx)
+
+    return subsets_dict, feature_idx_names
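The new `utilities` module centralizes the group-aware bookkeeping that the feature selectors rely on. Below is a minimal sketch, not part of the patch, of how the pieces fit together; it assumes mlxtend with this change installed, and it calls the private helper `_merge_lists` purely for illustration.

```python
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.feature_selection.utilities import _merge_lists

# _merge_lists maps group-level indices to a sorted tuple of column indices;
# _calc_score then scores the estimator on exactly those columns (X[:, IDX]).
feature_groups = [[0, 1], [2], [3]]
print(_merge_lists(feature_groups, [0, 2]))  # (0, 1, 3): columns of groups 0 and 2

# At the estimator level, k_features counts groups once feature_groups is given:
X, y = load_iris(return_X_y=True)
sfs = SFS(
    KNeighborsClassifier(n_neighbors=4),
    k_features=2,  # select 2 of the 3 groups
    cv=0,
    feature_groups=feature_groups,
)
sfs.fit(X, y)
print(sfs.k_feature_idx_)  # column indices of the selected groups
```

Because `k_features` counts groups here (see the comment in `test_regression_sbfs` above), `k_feature_idx_` can contain more column indices than `k_features`.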